library(readxl)
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(tidyr)
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(patchwork)
## ---- SETUP_for_PDSA_summary ---------------------------------------------

# 1. Load updated dataset (DF2)
df <- read_excel("DF2.xlsx", col_types = "text")

# 2. Standard NA cleanup for survey-style fields
na_codes <- c("", "NA", "N/A", "na", "n/a", "Na", "NULL", "null")

survey_cols <- grep("^T[01]-", names(df), value = TRUE)

df <- df %>%
  mutate(
    across(
      all_of(survey_cols),
      ~ {
        x <- trimws(as.character(.x))
        x[x %in% na_codes] <- NA_character_
        x
      }
    )
  )

# 3. Identify T0 and T1 question columns (used everywhere later)
t0_q_cols <- grep("^T0-", names(df), value = TRUE)
t1_q_cols <- grep("^T1-", names(df), value = TRUE)

# 4. Sanity check (optional, safe to keep)
length(t0_q_cols)
## [1] 7
length(t1_q_cols)
## [1] 15
## ---- PDSA_summary_clean_v2 -----------------------------------------------

# ---- Cohort rules (per your update) ----
T0_MIN    <- 2  # "more than 1" T0 answer
T1_MIN_C1 <- 3  # "more than 2" T1 answers

TARGET_COHORT1 <- 60
TARGET_COHORT2 <- 60

# ---- Define cohorts ONCE (early) ----
df_status <- df %>%
  dplyr::mutate(
    T0_answer_count = rowSums(!is.na(dplyr::select(., dplyr::all_of(t0_q_cols)))),
    T1_answer_count = rowSums(!is.na(dplyr::select(., dplyr::all_of(t1_q_cols))))
  ) %>%
  dplyr::mutate(
    Cohort = dplyr::case_when(
      T0_answer_count >= T0_MIN & T1_answer_count >= T1_MIN_C1 ~ "Cohort 1",
      T0_answer_count >= T0_MIN                                 ~ "Cohort 2",
      TRUE                                                      ~ "Not enrolled"
    ),
    Enrolled = if_else(Cohort %in% c("Cohort 1", "Cohort 2"), "Enrolled", "Not enrolled")
  )

df_COHORT1  <- df_status %>% dplyr::filter(Cohort == "Cohort 1")
df_COHORT2  <- df_status %>% dplyr::filter(Cohort == "Cohort 2")
df_enrolled <- df_status %>% dplyr::filter(Enrolled == "Enrolled")

# ---- A1c columns in DF2 (confirmed) ----
a1c_base_col   <- "HbA1c Value (Last 3 Months)"
a1c_t1_col     <- "Hemoglobin A1c (T1)"
a1c_change_col <- "A1c Change (T1)"   # use if present; else compute

# ---- Safe numeric cleaner for A1c text ----
clean_a1c <- function(x) {
  x <- trimws(as.character(x))
  x[x %in% na_codes] <- NA_character_
  x[x == "<4.0"]  <- "4.0"
  x[x == ">14.0"] <- "14.0"
  x[x == ">20.0"] <- "20.0"
  x <- gsub("%", "", x)
  suppressWarnings(as.numeric(x))
}

# ---- Build A1c dataset (no mutate-if_else bug) ----
df_a1c <- df_enrolled %>%
  dplyr::mutate(
    A1c_baseline = clean_a1c(.data[[a1c_base_col]]),
    A1c_T1       = clean_a1c(.data[[a1c_t1_col]])
  )

if (a1c_change_col %in% names(df_enrolled)) {
  df_a1c <- df_a1c %>% dplyr::mutate(A1c_change = clean_a1c(.data[[a1c_change_col]]))
} else {
  df_a1c <- df_a1c %>% dplyr::mutate(A1c_change = A1c_T1 - A1c_baseline)
}

# ---- PDSA thresholds (±1.0%) ----
df_a1c <- df_a1c %>%
  dplyr::mutate(
    A1c_change_cat = dplyr::case_when(
      !is.na(A1c_change) & A1c_change <= -1.0 ~ "Improved (≥1.0% reduction)",
      !is.na(A1c_change) & A1c_change >=  1.0 ~ "Worsened (≥1.0% increase)",
      !is.na(A1c_change)                      ~ "No meaningful change",
      TRUE                                    ~ NA_character_
    )
  )

# ---- Counts (actuals) ----
n_total    <- nrow(df_status)
n_c1       <- nrow(df_COHORT1)
n_c2       <- nrow(df_COHORT2)
n_enrolled <- nrow(df_enrolled)

# Survey completion (relevant: T0 + T1 only)
n_T0_ge2 <- sum(df_status$T0_answer_count >= T0_MIN)
n_T1_ge1 <- sum(df_status$T1_answer_count >= 1)

# A1c availability among enrolled
n_base_a1c   <- sum(!is.na(df_a1c$A1c_baseline))
n_t1_a1c     <- sum(!is.na(df_a1c$A1c_T1))
n_change_a1c <- sum(!is.na(df_a1c$A1c_change))

# ---- Change distribution among those WITH A1c change ----
chg_tab <- df_a1c %>%
  dplyr::filter(!is.na(A1c_change_cat)) %>%
  dplyr::count(A1c_change_cat, name = "Count") %>%
  dplyr::mutate(
    Denominator = sum(Count),
    Percent = round(100 * Count / Denominator, 1)
  )

# ---- PDSA-ready summary table (targets + shortfalls + key %s) ----
pdsa_summary <- tibble::tibble(
  Metric = c(
    "Total people in dataset",
    paste0("Cohort 1 (T0≥", T0_MIN, " AND T1≥", T1_MIN_C1, ") — actual"),
    "Cohort 1 — target",
    "Cohort 1 — shortfall",
    paste0("Cohort 2 (T0≥", T0_MIN, " AND T1<", T1_MIN_C1, ") — actual"),
    "Cohort 2 — target",
    "Cohort 2 — shortfall",
    "Enrolled (Cohort 1 + Cohort 2) — actual",
    "Among enrolled: baseline HbA1c recorded",
    "Among enrolled: T1 HbA1c recorded",
    "Among enrolled: A1c Change (T1) available"
  ),
  Count = c(
    n_total,
    n_c1,
    TARGET_COHORT1,
    TARGET_COHORT1 - n_c1,
    n_c2,
    TARGET_COHORT2,
    TARGET_COHORT2 - n_c2,
    n_enrolled,
    n_base_a1c,
    n_t1_a1c,
    n_change_a1c
  ),
  Denominator = c(
    n_total,
    TARGET_COHORT1,
    TARGET_COHORT1,
    TARGET_COHORT1,
    TARGET_COHORT2,
    TARGET_COHORT2,
    TARGET_COHORT2,
    TARGET_COHORT1 + TARGET_COHORT2,
    n_enrolled,
    n_enrolled,
    n_enrolled
  )
) %>%
  dplyr::mutate(Percent = round(100 * Count / Denominator, 1))

pdsa_summary
## # A tibble: 11 × 4
##    Metric                                    Count Denominator Percent
##    <chr>                                     <dbl>       <dbl>   <dbl>
##  1 Total people in dataset                     489         489   100  
##  2 Cohort 1 (T0≥2 AND T1≥3) — actual            38          60    63.3
##  3 Cohort 1 — target                            60          60   100  
##  4 Cohort 1 — shortfall                         22          60    36.7
##  5 Cohort 2 (T0≥2 AND T1<3) — actual            23          60    38.3
##  6 Cohort 2 — target                            60          60   100  
##  7 Cohort 2 — shortfall                         37          60    61.7
##  8 Enrolled (Cohort 1 + Cohort 2) — actual      61         120    50.8
##  9 Among enrolled: baseline HbA1c recorded      61          61   100  
## 10 Among enrolled: T1 HbA1c recorded            29          61    47.5
## 11 Among enrolled: A1c Change (T1) available    29          61    47.5
chg_tab
## # A tibble: 3 × 4
##   A1c_change_cat             Count Denominator Percent
##   <chr>                      <int>       <int>   <dbl>
## 1 Improved (≥1.0% reduction)    19          29    65.5
## 2 No meaningful change           9          29    31  
## 3 Worsened (≥1.0% increase)      1          29     3.4
df_counts <- df %>%
  mutate(
    T0_answer_count = rowSums(!is.na(select(., all_of(t0_q_cols)))),
    T1_answer_count = rowSums(!is.na(select(., all_of(t1_q_cols))))
  )

# -----------------------------
# 2. Define cohorts (FINAL)
# -----------------------------
df_counts <- df_counts %>%
  mutate(
    Cohort = case_when(
      T0_answer_count >= 2 & T1_answer_count >= 3 ~ "Cohort 1 (Completed T1)",
      T0_answer_count >= 2 & T1_answer_count <= 2 ~ "Cohort 2 (No / partial T1)",
      TRUE ~ "Not enrolled"
    )
  )

# -----------------------------
# 3. Core denominators
# -----------------------------
anticipated_per_cohort <- 60

n_total      <- nrow(df_counts)
n_cohort1    <- sum(df_counts$Cohort == "Cohort 1 (Completed T1)")
n_cohort2    <- sum(df_counts$Cohort == "Cohort 2 (No / partial T1)")
n_enrolled   <- n_cohort1 + n_cohort2

# -----------------------------
# 4. A1c availability (baseline + T1)
# -----------------------------
a1c_base_col <- "HbA1c Value (Last 3 Months)"
a1c_t1_col   <- "Hemoglobin A1c (T1)"

df_enrolled <- df_counts %>%
  filter(Cohort %in% c("Cohort 1 (Completed T1)", "Cohort 2 (No / partial T1)"))

n_base_a1c <- sum(!is.na(df_enrolled[[a1c_base_col]]) &
                    trimws(df_enrolled[[a1c_base_col]]) != "")

df_cohort1 <- df_counts %>%
  filter(Cohort == "Cohort 1 (Completed T1)")

n_t1_a1c <- sum(!is.na(df_cohort1[[a1c_t1_col]]) &
                 trimws(df_cohort1[[a1c_t1_col]]) != "")

# -----------------------------
# 5. Final PDSA-aligned summary table
# -----------------------------
pdsa_summary <- tibble(
  Metric = c(
    "Anticipated enrollment per cohort",
    "Observed Cohort 1 enrollment (completed T1)",
    "Observed Cohort 2 enrollment (partial / no T1)",
    "Total enrolled (Cohort 1 + Cohort 2)",
    "Cohort 1 survey completion rate",
    "Enrolled with baseline HbA1c recorded",
    "Cohort 1 with T1 HbA1c recorded"
  ),
  Count = c(
    anticipated_per_cohort,
    n_cohort1,
    n_cohort2,
    n_enrolled,
    n_cohort1,
    n_base_a1c,
    n_t1_a1c
  ),
  Denominator = c(
    anticipated_per_cohort,
    anticipated_per_cohort,
    anticipated_per_cohort,
    anticipated_per_cohort * 2,
    anticipated_per_cohort,
    n_enrolled,
    n_cohort1
  ),
  Percent = round(100 * Count / Denominator, 1)
)

pdsa_summary
## # A tibble: 7 × 4
##   Metric                                         Count Denominator Percent
##   <chr>                                          <dbl>       <dbl>   <dbl>
## 1 Anticipated enrollment per cohort                 60          60   100  
## 2 Observed Cohort 1 enrollment (completed T1)       38          60    63.3
## 3 Observed Cohort 2 enrollment (partial / no T1)    23          60    38.3
## 4 Total enrolled (Cohort 1 + Cohort 2)              61         120    50.8
## 5 Cohort 1 survey completion rate                   38          60    63.3
## 6 Enrolled with baseline HbA1c recorded             61          61   100  
## 7 Cohort 1 with T1 HbA1c recorded                   38          38   100
#RACE ANALYSIS

race_long <- df_status %>%
  select(Enrolled, Race) %>%
  filter(!is.na(Race), Race != "") %>%
  mutate(Race = str_split(Race, "\\r\\n")) %>%
  unnest(Race) %>%
  mutate(Race = str_trim(Race)) %>%
  filter(str_detect(Race, "^R"))   # keeps only race codes

race_long <- race_long %>%
  mutate(
    Race_clean = case_when(
      str_detect(Race, "R1") ~ "American Indian / Alaska Native",
      str_detect(Race, "R2") ~ "Asian",
      str_detect(Race, "R3") ~ "Black or African American",
      str_detect(Race, "R4") ~ "Native Hawaiian / Pacific Islander",
      str_detect(Race, "R5") ~ "White",
      str_detect(Race, "R9") ~ "Other",
      TRUE ~ NA_character_
    )
  ) %>%
  filter(!is.na(Race_clean))

race_enroll_tab <- race_long %>%
  count(Enrolled, Race_clean) %>%
  group_by(Enrolled) %>%
  mutate(percent = round(100 * n / sum(n), 1)) %>%
  ungroup()

race_enroll_tab
## # A tibble: 10 × 4
##    Enrolled     Race_clean                             n percent
##    <chr>        <chr>                              <int>   <dbl>
##  1 Enrolled     American Indian / Alaska Native        1     1.6
##  2 Enrolled     Black or African American             25    41  
##  3 Enrolled     Other                                 20    32.8
##  4 Enrolled     White                                 15    24.6
##  5 Not enrolled American Indian / Alaska Native        2     0.5
##  6 Not enrolled Asian                                  9     2.2
##  7 Not enrolled Black or African American            105    25.3
##  8 Not enrolled Native Hawaiian / Pacific Islander     1     0.2
##  9 Not enrolled Other                                101    24.3
## 10 Not enrolled White                                197    47.5
race_order <- race_enroll_tab %>%
  group_by(Race_clean) %>%
  summarize(max_pct = max(percent)) %>%
  arrange(max_pct) %>%
  pull(Race_clean)

race_enroll_tab <- race_enroll_tab %>%
  mutate(
    Race_clean = factor(Race_clean, levels = race_order),
    Enrolled = factor(Enrolled, levels = c("Not enrolled", "Enrolled"))
  )

ggplot(race_enroll_tab,
       aes(x = Race_clean, y = percent, fill = Enrolled)) +
  geom_col(position = position_dodge(width = 0.8)) +

  geom_text(
    aes(label = paste0(round(percent, 1), "%")),
    position = position_dodge(width = 0.8),
    hjust = -0.1,          # push text just outside bars
    size = 3.6,
    color = "black"
  ) +

  coord_flip(clip = "off") +   # allow labels past plot area

  scale_x_discrete(
    limits = rev(levels(race_enroll_tab$Race_clean))
  ) +

  scale_fill_manual(
    values = c(
      "Not enrolled" = "#CFE1F2",  # slightly lighter for contrast
      "Enrolled"     = "#1F4E79"
    )
  ) +

  expand_limits(y = max(race_enroll_tab$percent) + 8) +

  theme_bw() +
  labs(
    title = "Race Distribution by Enrollment Status",
    x = "Race",
    y = "% within enrollment group",
    fill = "Enrollment Status"
  ) +
  theme(
    plot.title  = element_text(face = "bold", size = 14),
    axis.text.y = element_text(size = 10),
    axis.text.x = element_text(size = 10),
    legend.position = "top"
  )

###SEX ANALYSIS
sex_tab <- df_status %>%   # <- change df to df_status
  filter(!is.na(`Legal Sex`)) %>%
  group_by(Enrolled, `Legal Sex`) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(Enrolled) %>%
  mutate(percent = 100 * n / sum(n))


ggplot(sex_tab,
       aes(x = `Legal Sex`, y = percent, fill = Enrolled)) +
  geom_col(position = position_dodge(width = 0.8)) +
  geom_text(aes(label = paste0(round(percent,1), "%")),
            position = position_dodge(width = 0.8),
            vjust = -0.3,
            size = 3.5) +
  scale_fill_manual(values = c(
    "Not enrolled" = "#CFE1F2",
    "Enrolled"     = "#1F4E79"
  )) +
  theme_bw() +
  labs(
    title = "Legal Sex by Enrollment Status",
    x = "Legal Sex",
    y = "% within enrollment group",
    fill = "Enrollment Status"
  )

###ETHNICITY 
## 1. Clean ethnicity into simple categories --------------------------
df_status <- df_status %>%
  mutate(
    Ethnicity_clean = dplyr::case_when(
      grepl("E1", Ethnicity) ~ "Spanish/Hispanic/Latino",
      grepl("E2", Ethnicity) ~ "Not Spanish/Hispanic/Latino",
      grepl("S1", Ethnicity) ~ "Patient declined",
      grepl("S2", Ethnicity) ~ "Patient unavailable",
      TRUE                   ~ NA_character_
    )
  )

## 2. Summarize % within each enrollment group ------------------------
eth_summary <- df_status %>%
  dplyr::filter(!is.na(Ethnicity_clean)) %>%
  dplyr::count(Enrolled, Ethnicity_clean) %>%
  dplyr::group_by(Enrolled) %>%
  dplyr::mutate(pct = 100 * n / sum(n)) %>%
  dplyr::ungroup()
## 3. Order ethnicity by overall prevalence (optional, just for nicer plot)
eth_order <- eth_summary %>%
  dplyr::group_by(Ethnicity_clean) %>%
  dplyr::summarise(overall_n = sum(n), .groups = "drop") %>%
  dplyr::arrange(dplyr::desc(overall_n)) %>%
  dplyr::pull(Ethnicity_clean)

eth_summary$Ethnicity_clean <- factor(eth_summary$Ethnicity_clean,
                                      levels = eth_order)
## 4. Plot: Ethnicity by enrollment status ----------------------------
ggplot(eth_summary,
       aes(x = Ethnicity_clean, y = pct, fill = Enrolled)) +
  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
  geom_text(aes(label = sprintf("%.1f%%", pct)),
            position = position_dodge(width = 0.8),
            vjust = -0.3, size = 3) +
  scale_fill_manual(values = c("Not enrolled" = "#a6cee3",
                               "Enrolled"     = "#1f78b4")) +
  labs(
    title = "Ethnicity by Enrollment Status",
    x     = "Ethnicity",
    y     = "% within enrollment group",
    fill  = "Enrollment Status"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 30, hjust = 1, size = 9),
    plot.title  = element_text(size = 14, face = "bold")
  )

## ---- hba1c_hist_by_enrollment (DF2) ---------------------------------

# 0) Ensure df_status + Enrolled exist (uses your rule: >=3 T0 answers)
t0_q_cols <- grep("^T0-", names(df), value = TRUE)
t0_q_cols <- t0_q_cols[!grepl("Contact|NOTES", t0_q_cols, ignore.case = TRUE)]

df_status <- df %>%
  mutate(
    T0_answer_count = rowSums(!is.na(dplyr::select(., all_of(t0_q_cols)))),
    Enrolled = if_else(T0_answer_count >= 3, "Enrolled", "Not enrolled")
  )

# 1) Clean HbA1c (baseline column in DF2)
clean_a1c <- function(x) {
  x <- trimws(as.character(x))
  x[x %in% c("", "NA", "N/A", "na", "n/a", "Na", "NULL", "null")] <- NA_character_
  x[x == "<4.0"]  <- "4.0"
  x[x == ">14.0"] <- "14.0"
  x[x == ">20.0"] <- "20.0"
  x <- gsub("%", "", x)
  as.numeric(x)
}

df_status <- df_status %>%
  mutate(HbA1c = clean_a1c(`HbA1c Value (Last 3 Months)`))

# 2) Pre-bin + percent within enrollment group (stable + easy to label)
df_hist <- df_status %>%
  filter(!is.na(HbA1c), !is.na(Enrolled)) %>%
  mutate(
    Hb_bin = floor(HbA1c),
    Hb_mid = Hb_bin + 0.5
  ) %>%
  count(Enrolled, Hb_bin, Hb_mid) %>%
  group_by(Enrolled) %>%
  mutate(pct = n / sum(n) * 100) %>%
  ungroup() %>%
  mutate(
    Enrolled = factor(Enrolled, levels = c("Enrolled", "Not enrolled")),
    label = ifelse(pct > 0, paste0(round(pct, 1), "%"), "")
  )

# 3) Plot (two stacked histograms, y-limit like your example)
ggplot(df_hist, aes(x = Hb_mid, y = pct)) +
  geom_col(width = 0.9, fill = "#4E79A7", color = "white") +
  geom_text(aes(label = label), vjust = -0.35, size = 3.5) +
  facet_wrap(~ Enrolled, ncol = 1, scales = "fixed") +
  scale_x_continuous(
    breaks = seq(min(df_hist$Hb_bin, na.rm = TRUE), max(df_hist$Hb_bin, na.rm = TRUE), by = 1),
    minor_breaks = NULL
  ) +
  coord_cartesian(ylim = c(0, 50)) +
  scale_y_continuous(labels = function(x) paste0(x, "%")) +
  theme_bw() +
  labs(
    title = "HbA1c Distribution by Enrollment Status",
    x = "HbA1c (%)",
    y = "% of patients within group"
  ) +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    strip.text = element_text(size = 12, face = "bold"),
    axis.text  = element_text(size = 11)
  )

### Blood sugar check frequency (T0 vs T1)

## 1. Identify the two survey columns
t0_col <- grep(
  "^T0- In the past week, how often did you check your blood sugar",
  names(df_COHORT1),
  value = TRUE
)
t1_col <- grep(
  "^T1- In the past week, how often did you check your blood sugar",
  names(df_COHORT1),
  value = TRUE
)

stopifnot(length(t0_col) == 1, length(t1_col) == 1)

t0_raw <- df_COHORT1[[t0_col]]
t1_raw <- df_COHORT1[[t1_col]]

## 2. Cleaning function: collapse equivalent strings
clean_freq <- function(x) {
  x_std <- toupper(trimws(as.character(x)))

  dplyr::case_when(
    # 1–2 times (all variants)
    x_std %in% c("1–2 TIMES", "1-2 TIMES", "1 TO 2 TIMES") ~ "1–2 times",

    # 3–4 times
    x_std %in% c("3–4 TIMES", "3-4 TIMES")                 ~ "3–4 times",

    # Daily
    x_std %in% c("DAILY", "DAILY ", "DAILY.")              ~ "Daily",

    # More than once daily
    x_std %in% c("MORE THAN DAILY",
                 "MORE THAN ONCE DAILY",
                 "MORE THAN ONCE  DAILY",
                 "MORE THAN ONCE A DAY")                   ~ "More than once daily",

    # Not at all
    x_std %in% c("NOT AT ALL")                             ~ "Not at all",

    # Anything unexpected
    TRUE                                                   ~ NA_character_
  )
}

## 3. Long data with cleaned responses
df_bs <- tibble::tibble(
  Timepoint = c(rep("T0", length(t0_raw)), rep("T1", length(t1_raw))),
  Response_raw = c(t0_raw, t1_raw)
) %>%
  mutate(Response_clean = clean_freq(Response_raw)) %>%
  filter(!is.na(Response_clean))

## 4. Summary table for plotting
freq_summary <- df_bs %>%
  count(Timepoint, Response_clean) %>%
  group_by(Timepoint) %>%
  mutate(pct = round(100 * n / sum(n), 1)) %>%
  ungroup() %>%
  mutate(
    Response_clean = factor(
      Response_clean,
      levels = c("Not at all",
                 "1–2 times",
                 "3–4 times",
                 "Daily",
                 "More than once daily")
    )
  )

## 5. Plot: frequency of blood sugar checks at T0 vs T1
ggplot(freq_summary,
       aes(x = Response_clean, y = pct, fill = Timepoint)) +
  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
  geom_text(aes(label = paste0(pct, "%")),
            position = position_dodge(width = 0.8),
            vjust = -0.3,
            size = 3.5) +
  theme_bw() +
  labs(
    title = "How Often Patients Checked Blood Sugar (T0 vs T1)",
    x     = "Frequency in past week",
    y     = "% of participants",
    fill  = "Timepoint"
  ) +
  theme(
    axis.text.x = element_text(angle = 30, hjust = 1, size = 10),
    plot.title  = element_text(size = 14, face = "bold")
  )

###blood sugar in range
sugar_levels_T0 <- c("No", "Maybe", "Yes")
sugar_levels_T1 <- c(
  "Never",
  "Sometimes",
  "About half the time",
  "Most of the time",
  "Always"
)
sugar_T0_raw <- df_COHORT1$`T0- Were your sugar levels usually in your target range last week?`
sugar_T1_raw <- df_COHORT1$`T1- Were your sugar levels usually in your target range last week?`

length(sugar_T0_raw)
## [1] 38
length(sugar_T1_raw)
## [1] 38
clean_T0 <- function(x) {
  x_std <- toupper(trimws(x))
  case_when(
    x_std == "YES"   ~ "Yes",
    x_std == "NO"    ~ "No",
    x_std == "MAYBE" ~ "Maybe",
    TRUE             ~ NA_character_
  )
}

clean_T1 <- function(x) {
  x_std <- toupper(trimws(x))
  case_when(
    x_std %in% c("NEVER") ~ "Never",
    x_std == "SOMETIMES" ~ "Sometimes",
    x_std %in% c("ABOUT HALF THE TIME", "HALF THE TIME") ~ "About half",
    x_std %in% c("MOST OF THE TIME", "YES") ~ "Most",
    x_std == "ALWAYS" ~ "Always",
    TRUE ~ NA_character_
  )
}

T0_clean <- clean_T0(sugar_T0_raw)
T1_clean <- clean_T1(sugar_T1_raw)

score_map <- c(
  "No" = 0,
  "Never" = 0,

  "Maybe" = 1,
  "Sometimes" = 1,

  "About half" = 2,

  "Yes" = 3,
  "Most" = 3,

  "Always" = 4
)

T0_score <- unname(score_map[T0_clean])
T1_score <- unname(score_map[T1_clean])

df_change <- data.frame(
  patient_id = seq_len(nrow(df_COHORT1)),
  T0_raw = T0_clean,
  T1_raw = T1_clean,
  T0_score = T0_score,
  T1_score = T1_score
) %>%
  filter(!is.na(T0_score) & !is.na(T1_score)) %>%
  mutate(
    delta = T1_score - T0_score,
    change = case_when(
      delta > 0  ~ "Increased",
      delta < 0  ~ "Decreased",
      TRUE       ~ "No change"
    )
  )

change_summary <- df_change %>%
  count(change) %>%
  mutate(percent = round(100 * n / sum(n), 1))

change_summary$change <- factor(
  change_summary$change,
  levels = c("Decreased", "No change", "Increased")
)

ggplot(change_summary, aes(x = change, y = percent, fill = change)) +
  geom_col() +
  geom_text(
    aes(label = paste0(percent, "%")),
    vjust = -0.3,
    size = 4
  ) +
  scale_fill_manual(
    values = c(
      "Decreased" = "#d73027",
      "No change" = "#4E79A7",
      "Increased" = "#1a9850"
    )
  ) +
  theme_bw() +
  labs(
    title = "Change in Blood Sugar in Target Range (T0 → T1)",
    x = "",
    y = "% of participants"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 14, face = "bold")
  )

change_grouped_table <- df_change %>%
  count(change, T0_raw, T1_raw) %>%
  arrange(change, desc(n))

change_grouped_percent <- df_change %>%
  count(change, T0_raw, T1_raw) %>%
  group_by(change) %>%
  mutate(percent = round(100 * n / sum(n), 1)) %>%
  ungroup() %>%
  arrange(change, desc(percent))

change_grouped_wide <- change_grouped_percent %>%
  mutate(label = paste0(n, " (", percent, "%)")) %>%
  select(change, T0_raw, T1_raw, label) %>%
  tidyr::pivot_wider(
    names_from  = T1_raw,
    values_from = label,
    values_fill = ""
  )
change_grouped_wide
## # A tibble: 8 × 7
##   change    T0_raw Sometimes   Never       Most        `About half` Always    
##   <chr>     <chr>  <chr>       <chr>       <chr>       <chr>        <chr>     
## 1 Decreased Yes    "2 (66.7%)" ""          ""          ""           ""        
## 2 Decreased Maybe  ""          "1 (33.3%)" ""          ""           ""        
## 3 Increased No     "6 (33.3%)" ""          "2 (11.1%)" ""           "1 (5.6%)"
## 4 Increased Maybe  ""          ""          "5 (27.8%)" "2 (11.1%)"  "1 (5.6%)"
## 5 Increased Yes    ""          ""          ""          ""           "1 (5.6%)"
## 6 No change Yes    ""          ""          "7 (63.6%)" ""           ""        
## 7 No change Maybe  "3 (27.3%)" ""          ""          ""           ""        
## 8 No change No     ""          "1 (9.1%)"  ""          ""           ""
###Confidence in blood sugar maintenance practices
confidence_levels <- c(
  "Not confident",
  "Somewhat not confident",
  "Maybe",
  "Confident",
  "Very confident"
)

# T0: Yes / Maybe / No → 1–3
conf_T0_raw <- df_COHORT1$`T0- Are you Confident in making food choices that help control your blood sugar?`

conf_T0_num <- dplyr::case_when(
  toupper(trimws(conf_T0_raw)) == "NO"    ~ 1,
  toupper(trimws(conf_T0_raw)) == "MAYBE" ~ 2,
  toupper(trimws(conf_T0_raw)) == "YES"   ~ 3,
  TRUE ~ NA_real_
)

# T1: 1–4 scale or text variants → 1–4
conf_T1_raw <- df_COHORT1$`T1- How Confident are you in making food choices that help control your blood sugar?`

conf_T1_num <- {
  x <- toupper(trimws(as.character(conf_T1_raw)))
  dplyr::case_when(
    x %in% c("1", "NOT CONFIDENT")                     ~ 1,
    x %in% c("2", "SOMEWHAT NOT CONFIDENT")            ~ 2,
    x %in% c("3", "CONFIDENT", "COFIDENT", "CONFIDEMT") ~ 3,
    x %in% c("4", "VERY CONFIDENT")                    ~ 4,
    TRUE ~ NA_real_
  )
}

# Combine & classify change
df_conf_change <- data.frame(
  conf_T0_num = conf_T0_num,
  conf_T1_num = conf_T1_num
) %>%
  dplyr::filter(!is.na(conf_T0_num), !is.na(conf_T1_num)) %>%
  dplyr::mutate(
    change = conf_T1_num - conf_T0_num,
    ChangeCategory = dplyr::case_when(
      change > 0  ~ "Increased confidence",
      change == 0 ~ "No change",
      change < 0  ~ "Decreased confidence"
    )
  )

change_summary <- df_conf_change %>%
  dplyr::count(ChangeCategory) %>%
  dplyr::mutate(
    percent = round(100 * n / sum(n), 1),
    label   = paste0(n, " (", percent, "%)")
  )

ggplot(change_summary, aes(x = ChangeCategory, y = n)) +
  geom_col(fill = "#2E86C1", width = 0.6) +
  geom_text(aes(label = label),
            vjust = -0.3, size = 3.8) +
  scale_y_continuous(
    expand = expansion(mult = c(0, 0.1))
  ) +
  theme_bw() +
  labs(
    title = "Change in Confidence Making Food Choices (T0 → T1)",
    x     = "",
    y     = "Number of participants"
  ) +
  theme(
    axis.text.x  = element_text(size = 11),
    axis.title.y = element_text(size = 11),
    plot.title   = element_text(size = 14, face = "bold")
  )

# ---- T0 confidence (Yes / Maybe / No) ----
conf_T0_raw <- df_COHORT1$`T0- Are you Confident in making food choices that help control your blood sugar?`

conf_T0_label <- dplyr::case_when(
  toupper(trimws(conf_T0_raw)) == "NO"    ~ "No",
  toupper(trimws(conf_T0_raw)) == "MAYBE" ~ "Maybe",
  toupper(trimws(conf_T0_raw)) == "YES"   ~ "Yes",
  TRUE ~ NA_character_
)

# ---- T1 confidence (1–4 or text variants) ----
conf_T1_raw <- df_COHORT1$`T1- How Confident are you in making food choices that help control your blood sugar?`

conf_T1_label <- {
  x <- toupper(trimws(as.character(conf_T1_raw)))
  dplyr::case_when(
    x %in% c("1", "NOT CONFIDENT")                       ~ "Not confident",
    x %in% c("2", "SOMEWHAT NOT CONFIDENT")              ~ "Somewhat not confident",
    x %in% c("3", "CONFIDENT", "COFIDENT", "CONFIDEMT")  ~ "Confident",
    x %in% c("4", "VERY CONFIDENT")                      ~ "Very confident",
    TRUE ~ NA_character_
  )
}


conf_transition <- data.frame(
  T0 = conf_T0_label,
  T1 = conf_T1_label,
  stringsAsFactors = FALSE
) %>%
  dplyr::filter(!is.na(T0), !is.na(T1))

conf_transition_table <- conf_transition %>%
  dplyr::count(T0, T1, name = "n") %>%
  dplyr::arrange(desc(n))

conf_transition_table
##       T0                     T1  n
## 1    Yes         Very confident 19
## 2    Yes              Confident  5
## 3  Maybe              Confident  4
## 4     No         Very confident  2
## 5    Yes Somewhat not confident  2
## 6  Maybe Somewhat not confident  1
## 7  Maybe         Very confident  1
## 8     No              Confident  1
## 9     No Somewhat not confident  1
## 10   Yes          Not confident  1
## Appointments T0 vs T1 – enrolled pts (>= 3 T0 answers, >=1 T1 answer)

# 1. Identify T0 / T1 question columns ------------------------------
t0_q_cols <- grep("^T0-", names(df), value = TRUE)
t1_q_cols <- grep("^T1-", names(df), value = TRUE)

# 2. Define analysis cohort locally (DOES NOT overwrite df_status) --
df_appt_base <- df %>%
  mutate(
    T0_answer_count = rowSums(!is.na(dplyr::select(., all_of(t0_q_cols)))),
    T1_answer_count = rowSums(!is.na(dplyr::select(., all_of(t1_q_cols))))
  ) %>%
  filter(
    T0_answer_count >= 3,   # your enrollment rule
    T1_answer_count > 0     # must have answered something at T1
  )

# 3. Pull raw appointment columns from this filtered cohort ---------
appt_T0_raw <- df_appt_base$`T0- Since your discharge from the hospital, have you had an appointment with a healthcare provider for your diabetes?`
appt_T1_raw <- df_appt_base$`T1- Since our last check in, have you had an appointment with a healthcare provider for your diabetes?`

# 4. Clean Yes/No responses -----------------------------------------
clean_yesno <- function(x) {
  x_std <- toupper(trimws(as.character(x)))
  dplyr::case_when(
    x_std %in% c("YES", "Y", "YS") ~ "Yes",
    x_std %in% c("NO",  "N")       ~ "No",
    TRUE                           ~ NA_character_
  )
}

appt_T0_clean <- clean_yesno(appt_T0_raw)
appt_T1_clean <- clean_yesno(appt_T1_raw)

# 5. Build paired change dataset (complete appointment data only) ---
df_appt_change <- tibble(
  patient_id = seq_len(nrow(df_appt_base)),
  T0_raw     = appt_T0_clean,
  T1_raw     = appt_T1_clean
) %>%
  filter(!is.na(T0_raw), !is.na(T1_raw))   # drops any remaining NA pairs

# 6. Score Yes/No and classify change -------------------------------
score_map <- c("No" = 0, "Yes" = 1)

df_appt_change <- df_appt_change %>%
  mutate(
    T0_score = unname(score_map[T0_raw]),
    T1_score = unname(score_map[T1_raw]),
    delta    = T1_score - T0_score,
    change   = dplyr::case_when(
      delta > 0  ~ "Increased",   # No -> Yes
      delta < 0  ~ "Decreased",   # Yes -> No
      TRUE       ~ "No change"    # Yes->Yes or No->No
    )
  )

# 7. Summary for bar chart ------------------------------------------
change_summary <- df_appt_change %>%
  count(change) %>%
  mutate(
    percent = round(100 * n / sum(n), 1),
    change  = factor(change, levels = c("Decreased", "No change", "Increased"))
  )

# 8. (Optional) transition table object (not printed automatically) -
transition_table_wide <- df_appt_change %>%
  count(change, T0_raw, T1_raw) %>%
  group_by(change) %>%
  mutate(
    percent = round(100 * n / sum(n), 1),
    label   = paste0(n, " (", percent, "%)")
  ) %>%
  ungroup() %>%
  select(change, T0_raw, T1_raw, label) %>%
  pivot_wider(
    names_from  = T1_raw,
    values_from = label,
    values_fill = ""
  )
# View(transition_table_wide) if you want to inspect it

# 9. Pie-chart data (same cohort as change plot) --------------------
appt_pie_df <- df_appt_change %>%
  select(T0_raw, T1_raw) %>%
  mutate(row = row_number()) %>%
  pivot_longer(cols = c(T0_raw, T1_raw),
               names_to = "Timepoint",
               values_to = "Response") %>%
  mutate(
    Timepoint = if_else(Timepoint == "T0_raw", "T0", "T1"),
    Response  = factor(Response, levels = c("Yes", "No"))
  )

appt_pie_summary <- appt_pie_df %>%
  count(Timepoint, Response) %>%
  group_by(Timepoint) %>%
  mutate(
    percent = round(100 * n / sum(n), 1),
    label   = paste0(percent, "%\n(n=", n, ")")
  ) %>%
  ungroup()

# 10. Plots: bar of change + two pies -------------------------------
p_change <- ggplot(change_summary,
                   aes(x = change, y = percent, fill = change)) +
  geom_col() +
  geom_text(
    aes(label = paste0(percent, "%")),
    vjust = -0.3,
    size = 4
  ) +
  scale_fill_manual(
    values = c(
      "Decreased" = "#d73027",
      "No change" = "#4E79A7",
      "Increased" = "#1a9850"
    )
  ) +
  scale_y_continuous(
    limits = c(0, 80),
    breaks = seq(0, 80, by = 10),
    labels = function(x) paste0(x, "%")
  ) +
  theme_bw() +
  labs(
    title = "Change in Having a Diabetes-Related Appointment (T0 → T1)",
    x = "",
    y = "% of participants"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 14, face = "bold")
  )


p_pies <- ggplot(appt_pie_summary,
                 aes(x = "", y = percent, fill = Response)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  geom_text(
    aes(label = label),
    position = position_stack(vjust = 0.5),
    size = 3.8
  ) +
  facet_wrap(~ Timepoint) +
  scale_fill_manual(values = c("Yes" = "#1a9850", "No" = "#d73027")) +
  theme_void() +
  labs(
    title = "Appointments with Diabetes Provider at T0 and T1",
    fill  = "Response"
  ) +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    strip.text = element_text(size = 12, face = "bold")
  )

p_change / p_pies   # <- this is the only thing that prints

## ---- impact_meals_made ---------------------------------------------

meals_raw <- df_COHORT1$`T1- In the past week, how many meals did you make using food from the program?`

clean_meals <- function(x) {
  x_std <- toupper(trimws(as.character(x)))
  dplyr::case_when(
    x_std %in% c("0")            ~ "0",
    x_std %in% c("1","1 MEAL")   ~ "1",
    x_std %in% c("2 TO 3","2-3","2–3","2 TO3","2 TO 3") ~ "2–3",
    x_std %in% c("4 TO 6","4-6","4–6","4 TO 6") ~ "4–6",
    str_detect(x_std,"7")        ~ "7+",
    TRUE ~ NA_character_
  )
}

meals_clean <- clean_meals(meals_raw)

meals_summary <- tibble::tibble(meals = meals_clean) %>%
  filter(!is.na(meals)) %>%
  count(meals) %>%
  mutate(
    percent = round(100 * n / sum(n), 1),
    label   = paste0(percent, "%")
  )

ggplot(meals_summary, aes(x = meals, y = percent)) +
  geom_col(fill = "#1F4E79") +
  geom_text(aes(label = label), vjust = -0.3, size = 3.8) +
  theme_bw() +
  labs(
    title = "Meals prepared using program food (past week)",
    x     = "Number of meals",
    y     = "% of participants"
  )

## ---- impact_ease_of_use --------------------------------------------
## ---- impact_ease_of_use_final_fix ----------------------------------

ease_raw <- df_COHORT1$`T1- In the past week, how easy was it to use the food in your meals?`

ease_clean <- dplyr::case_when(
  trimws(ease_raw) == "1" ~ "Very easy",
  TRUE                   ~ str_to_sentence(trimws(ease_raw))
)

ease_summary <- tibble(response = ease_clean) %>%
  filter(!is.na(response)) %>%
  count(response) %>%
  mutate(
    percent = round(100 * n / sum(n), 1),
    response = factor(
      response,
      levels = c("Neutral", "Easy", "Very easy"),
      ordered = TRUE
    )
  )

ggplot(ease_summary, aes(x = response, y = percent)) +
  geom_col(fill = "#2E86C1") +
  geom_text(
    aes(label = paste0(percent, "%")),
    vjust = -0.3,
    size = 4
  ) +
  theme_bw() +
  labs(
    title = "Ease of Using Program Food in Meals (T1)",
    x     = "",
    y     = "% of participants"
  ) +
  theme(
    axis.text.x = element_text(size = 11),
    plot.title  = element_text(size = 14, face = "bold")
  )

## -------------------------------
## FOOD WASTE (T1 ONLY – IMPACT)
## -------------------------------

# 1. Identify the T1 waste column explicitly
waste_col <- grep(
  "^T1-.*throw away any of the food",
  names(df_COHORT1),
  value = TRUE
)

# sanity check (must be length 1)
waste_col
## [1] "T1-Did you throw away any of the food from your most recent delivery?"
stopifnot(length(waste_col) == 1)

# 2. Pull raw responses
waste_raw <- df_COHORT1[[waste_col]]

# 3. Clean Yes / No
waste_clean <- case_when(
  toupper(trimws(waste_raw)) %in% c("YES", "Y") ~ "Yes",
  toupper(trimws(waste_raw)) %in% c("NO", "N")  ~ "No",
  TRUE ~ NA_character_
)

# 4. Build summary table
waste_summary <- tibble(Response = waste_clean) %>%
  filter(!is.na(Response)) %>%
  count(Response) %>%
  mutate(
    percent = round(100 * n / sum(n), 1),
    label = paste0(percent, "%\n(n=", n, ")")
  )

# 5. Plot (simple, low-risk)
ggplot(waste_summary, aes(x = Response, y = percent, fill = Response)) +
  geom_col(width = 0.6) +
  geom_text(aes(label = label), vjust = -0.3, size = 4) +
  scale_fill_manual(
    values = c("No" = "#1a9850", "Yes" = "#d73027")
  ) +
  scale_y_continuous(
    limits = c(0, 100),
    labels = function(x) paste0(x, "%")
  ) +
  theme_bw() +
  labs(
    title = "Did Patients Throw Away Delivered Food? (T1)",
    x = "",
    y = "% of participants"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 14, face = "bold")
  )

## ------------------------------
## Program impact on feeling in control of diabetes (T1)

# 1. Pull the T1 column
control_col <- grep(
  "^T1-How much did this program help you feel more in control of your diabetes",
  names(df_COHORT1),
  value = TRUE
)

control_raw <- df_COHORT1[[control_col]]

# 2. Clean / recode responses
control_clean <- toupper(trimws(as.character(control_raw)))

control_clean <- dplyr::case_when(
  control_clean %in% c("NOT AT ALL", "0") ~ "Not at all",
  control_clean %in% c("A LITTLE", "A LITTLE ", "1") ~ "A little",
  control_clean %in% c("SOMEWHAT", "2") ~ "Somewhat",
  control_clean %in% c("A LOT", "A LOT ", "3") ~ "A lot",
  TRUE ~ NA_character_
)

# 3. Summarize % of participants
control_summary <- tibble::tibble(response = control_clean) %>%
  dplyr::filter(!is.na(response)) %>%
  dplyr::count(response) %>%
  dplyr::mutate(
    percent  = round(100 * n / sum(n), 1),
    response = factor(response,
                      levels = c("Not at all", "A little", "Somewhat", "A lot"))
  )

# 4. Plot
ggplot(control_summary, aes(x = response, y = percent)) +
  geom_col(fill = "#4E79A7", width = 0.7) +
  geom_text(
    aes(label = paste0(percent, "%")),
    vjust = -0.3,
    size  = 4
  ) +
  theme_bw() +
  labs(
    title = "Program Impact on Feeling in Control of Diabetes (T1)",
    x     = "",
    y     = "% of participants"
  ) +
  coord_cartesian(ylim = c(0, 100)) +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.text.x = element_text(size = 11, angle = 30, hjust = 1),
    axis.text.y = element_text(size = 11)
  )

## ---- df2_new_impact, echo = TRUE -----------------------------------
## This chunk assumes df, df_status, df_COHORT1 are already defined
## exactly as in your current script.

## =========================
## A. Latest A1c under 8%?
## =========================

a1c_latest_summary <- df_status %>%
  filter(Enrolled == "Enrolled") %>%
  mutate(
    A1c_under8_latest = case_when(
      `A1c Under 8? (8/1/25-10/2/25)` %in% c("Y", "y", "YES", "Yes") ~ "Under 8%",
      `A1c Under 8? (8/1/25-10/2/25)` %in% c("N", "n", "NO",  "No")  ~ "8% or higher",
      TRUE ~ NA_character_
    )
  ) %>%
  filter(!is.na(A1c_under8_latest)) %>%
  count(A1c_under8_latest) %>%
  mutate(
    percent = round(100 * n / sum(n), 1),
    A1c_under8_latest = factor(
      A1c_under8_latest,
      levels = c("8% or higher", "Under 8%")
    )
  )

p_a1c_latest <- ggplot(a1c_latest_summary,
                       aes(x = A1c_under8_latest, y = percent)) +
  geom_col(fill = "#4E79A7") +
  geom_text(aes(label = paste0(percent, "%")),
            vjust = -0.3, size = 3.8) +
  theme_bw() +
  labs(
    title = "Latest A1c Control (8/1/25–10/2/25) – Enrolled Patients",
    x     = "",
    y     = "% of enrolled patients"
  ) +
  scale_y_continuous(
    limits = c(0, 100),
    labels = function(x) paste0(x, "%")
  )


## =========================
## B. ED/IP Utilization at T1
## =========================

util_base <- df_status %>%
  filter(Enrolled == "Enrolled") %>%
  mutate(
    ED_num = suppressWarnings(as.numeric(`ED Utilization (T1)`)),
    IP_num = suppressWarnings(as.numeric(`IP Utilization (T1)`)),
    ED_group = case_when(
      is.na(ED_num)           ~ NA_character_,
      ED_num == 0             ~ "0 ED visits",
      ED_num >= 1             ~ "≥1 ED visit"
    ),
    IP_group = case_when(
      is.na(IP_num)           ~ NA_character_,
      IP_num == 0             ~ "0 IP stays",
      IP_num >= 1             ~ "≥1 IP stay"
    )
  )

ed_summary <- util_base %>%
  filter(!is.na(ED_group)) %>%
  count(ED_group) %>%
  mutate(
    percent = round(100 * n / sum(n), 1),
    ED_group = factor(ED_group, levels = c("0 ED visits", "≥1 ED visit"))
  )

ip_summary <- util_base %>%
  filter(!is.na(IP_group)) %>%
  count(IP_group) %>%
  mutate(
    percent = round(100 * n / sum(n), 1),
    IP_group = factor(IP_group, levels = c("0 IP stays", "≥1 IP stay"))
  )

p_ed <- ggplot(ed_summary, aes(x = ED_group, y = percent)) +
  geom_col(fill = "#1F4E79") +
  geom_text(aes(label = paste0(percent, "%")),
            vjust = -0.3, size = 3.5) +
  theme_bw() +
  labs(
    title = "ED Utilization at T1 – Enrolled Patients",
    x     = "",
    y     = "% of enrolled patients"
  ) +
  scale_y_continuous(
    limits = c(0, 100),
    labels = function(x) paste0(x, "%")
  )

p_ip <- ggplot(ip_summary, aes(x = IP_group, y = percent)) +
  geom_col(fill = "#2E86C1") +
  geom_text(aes(label = paste0(percent, "%")),
            vjust = -0.3, size = 3.5) +
  theme_bw() +
  labs(
    title = "Inpatient Utilization at T1 – Enrolled Patients",
    x     = "",
    y     = "% of enrolled patients"
  ) +
  scale_y_continuous(
    limits = c(0, 100),
    labels = function(x) paste0(x, "%")
  )

## If you want them stacked vertically in one display:
(p_a1c_latest) / (p_ed | p_ip)

## =========================
## C. Social Needs / SDoH Flags
## =========================

sdoh_cols <- c(
  "Food", "Housing", "Housing 2", "Finance",
  "Transport", "Utilities", "Legal", "Violence", "Caregiver"
)

sdoh_long <- df_status %>%
  filter(Enrolled == "Enrolled") %>%
  mutate(patient_id = row_number()) %>%
  select(patient_id, all_of(sdoh_cols)) %>%
  tidyr::pivot_longer(
    cols = all_of(sdoh_cols),
    names_to = "Domain",
    values_to = "raw"
  ) %>%
  mutate(
    raw_std = toupper(trimws(as.character(raw))),
    Need = case_when(
      raw_std %in% c("Y", "YES") ~ "Need identified",
      raw_std %in% c("N", "NO")  ~ "No need",
      TRUE                       ~ NA_character_
    )
  ) %>%
  filter(!is.na(Need))

sdoh_summary <- sdoh_long %>%
  count(Domain, Need) %>%
  group_by(Domain) %>%
  mutate(
    percent = round(100 * n / sum(n), 1)
  ) %>%
  ungroup()

# Order domains by % with a "Need identified"
sdoh_order <- sdoh_summary %>%
  filter(Need == "Need identified") %>%
  arrange(desc(percent)) %>%
  pull(Domain)

sdoh_summary$Domain <- factor(sdoh_summary$Domain, levels = sdoh_order)

p_sdoh <- ggplot(sdoh_summary,
                 aes(x = Domain, y = percent, fill = Need)) +
  geom_col(position = "fill") +
  scale_y_continuous(
    labels = function(x) paste0(x * 100, "%")
  ) +
  coord_flip() +
  scale_fill_manual(
    values = c("Need identified" = "#d73027", "No need" = "#1a9850")
  ) +
  theme_bw() +
  labs(
    title = "Social Needs by Domain – Enrolled Patients",
    x     = "Domain",
    y     = "Within-domain % of patients",
    fill  = ""
  ) +
  theme(
    axis.text.y = element_text(size = 9),
    plot.title  = element_text(size = 14, face = "bold")
  )

p_sdoh

## ---- find_A1c_columns ---------------------------------------------------
## ---- A1c_change_baseline_to_T1 --------------------------------------

# 0. Quickly confirm the A1c-related columns that exist in DF2
grep("A1c|HbA1c|Hemoglobin", names(df), value = TRUE)
## [1] "HbA1c Value (Last 3 Months)"   "A1c Under 8? (8/1/25-10/2/25)"
## [3] "Hemoglobin A1c (T1)"           "A1c Change (T1)"              
## [5] "A1c Under 8? (T1)"
# 1. Helper to clean A1c values safely
clean_a1c <- function(x) {
  x <- trimws(as.character(x))

  # Treat common NA codes as NA
  x[x %in% c("", "NA", "N/A", "na", "n/a", "Na", "NULL", "null")] <- NA

  # Handle inequality codes if they exist
  x[x == "<4.0"]  <- "4.0"
  x[x == ">14.0"] <- "14.0"
  x[x == ">20.0"] <- "20.0"

  # Strip percent signs
  x <- gsub("%", "", x)

  as.numeric(x)
}

# 2. Build data frame with baseline, T1, and change
df_a1c <- df %>%
  mutate(
    A1c_baseline = clean_a1c(`HbA1c Value (Last 3 Months)`),
    A1c_T1       = clean_a1c(`Hemoglobin A1c (T1)`),
    A1c_change   = A1c_T1 - A1c_baseline
  ) %>%
  # Keep only patients with both measurements
  filter(!is.na(A1c_baseline), !is.na(A1c_T1))

## ---- A1c_change_categorical ---------------------------------------

df_a1c_cat <- df_a1c %>%
  mutate(
    A1c_change_cat = case_when(
      A1c_change <= -1.0 ~ "Improved (≥1.0% ↓)",
      A1c_change >=  1.0 ~ "Worsened (≥1.0% ↑)",
      TRUE               ~ "No meaningful change"
    )
  )

a1c_cat_summary <- df_a1c_cat %>%
  count(A1c_change_cat) %>%
  mutate(
    percent = round(100 * n / sum(n), 1),
    label   = paste0(percent, "%")
  )

ggplot(a1c_cat_summary,
       aes(x = A1c_change_cat, y = percent, fill = A1c_change_cat)) +
  geom_col(width = 0.7) +
  geom_text(aes(label = label), vjust = -0.3, size = 4) +
  scale_fill_manual(values = c(
    "Improved (≥1.0% ↓)"       = "#1a9850",
    "No meaningful change"     = "#4E79A7",
    "Worsened (≥1.0% ↑)"       = "#d73027"
  )) +
  theme_bw() +
  labs(
    title = "Change in HbA1c from Baseline to T1",
    x = "",
    y = "% of patients"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 14, face = "bold"),
    axis.text.x = element_text(size = 11)
  )