library(readxl)
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(tidyr)
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(patchwork)
# =========================
# 1) Load + basic cleaning
# =========================
df <- read_excel("COHORT 1 with A1C and Hosp Util.xlsx", col_types = "text")
## New names:
## • `` -> `...50`
na_codes <- c("", "NA", "N/A", "na", "n/a", "Na", "NULL", "null")

# Clean only survey-style columns (T0/T1) + keep everything else as-is
survey_cols <- grep("^T[01]-", names(df), value = TRUE)

df <- df %>%
  mutate(
    across(
      all_of(survey_cols),
      ~ {
        x <- trimws(as.character(.x))
        x[x %in% na_codes] <- NA_character_
        x
      }
    )
  )

# Identify question columns (if you need later)
t0_q_cols <- grep("^T0-", names(df), value = TRUE)
t1_q_cols <- grep("^T1-", names(df), value = TRUE)

# =========================
# 2) Helper cleaners
# =========================
clean_yesno <- function(x) {
  x <- toupper(trimws(as.character(x)))
  dplyr::case_when(
    x %in% c("Y","YES","TRUE","1") ~ "Yes",
    x %in% c("N","NO","FALSE","0") ~ "No",
    TRUE ~ NA_character_
  )
}

clean_cohort <- function(x) {
  x <- toupper(trimws(as.character(x)))
  dplyr::case_when(
    x %in% c("1","COHORT 1","COHORT1","C1") ~ "Cohort 1",
    x %in% c("2","COHORT 2","COHORT2","C2") ~ "Cohort 2",
    TRUE ~ NA_character_
  )
}

clean_num <- function(x) {
  x <- trimws(as.character(x))
  x[x %in% na_codes] <- NA
  suppressWarnings(as.numeric(x))
}

clean_a1c <- function(x) {
  x <- trimws(as.character(x))
  x[x %in% na_codes] <- NA
  x[x == "<4.0"]  <- "4.0"
  x[x == ">14.0"] <- "14.0"
  x[x == ">20.0"] <- "20.0"
  x <- gsub("%", "", x)
  suppressWarnings(as.numeric(x))
}

# =========================
# 3) REQUIRED column names
# =========================
col_enrolled <- "Enrolled"
col_cohort   <- "Cohort"

# Baseline utilization (last 90 days)
col_ed_base  <- "ED Vis Last 90 Days"
col_ip_base  <- "Hosp Last 90 Days"

# T1 utilization
col_ed_t1 <- "ED Utilization (T1)"
col_ip_t1 <- "IP Utilization (T1)"

# A1c columns
col_a1c_base <- "HbA1c Value (Last 3 Months)"
col_a1c_t1   <- "Hemoglobin A1c (T1)"

needed <- c(
  col_enrolled, col_cohort,
  col_ed_base, col_ip_base, col_ed_t1, col_ip_t1,
  col_a1c_base, col_a1c_t1
)

missing <- setdiff(needed, names(df))
if (length(missing) > 0) {
  stop("Missing required columns in this dataset: ", paste(missing, collapse = ", "))
}

# =========================
# 4) Define Cohort 1 cleanly
# =========================
df_status <- df %>%
  mutate(
    Enrolled_clean = clean_yesno(.data[[col_enrolled]]),
    Cohort_clean   = clean_cohort(.data[[col_cohort]])
  ) %>%
  mutate(
    Cohort_final = case_when(
      Enrolled_clean == "Yes" & Cohort_clean == "Cohort 1" ~ "Cohort 1",
      Enrolled_clean == "Yes" & Cohort_clean == "Cohort 2" ~ "Cohort 2",
      TRUE ~ "Not enrolled"
    )
  )

df_COHORT1 <- df_status %>% filter(Cohort_final == "Cohort 1")

cat("\nCohort 1 N =", nrow(df_COHORT1), "\n")
## 
## Cohort 1 N = 60
# =========================
# 5) Utilization dataset
# =========================
df_util <- df_COHORT1 %>%
  transmute(
    ED_base = clean_num(.data[[col_ed_base]]),
    IP_base = clean_num(.data[[col_ip_base]]),
    ED_T1   = clean_num(.data[[col_ed_t1]]),
    IP_T1   = clean_num(.data[[col_ip_t1]])
  )

# Binary groups (0 vs >=1) for simple PDSA reporting
util_long <- df_util %>%
  mutate(id = row_number()) %>%
  pivot_longer(
    cols = c(ED_base, ED_T1, IP_base, IP_T1),
    names_to = "Measure",
    values_to = "Count"
  ) %>%
  mutate(
    Timepoint = case_when(
      Measure %in% c("ED_base","IP_base") ~ "Baseline (last 90d)",
      TRUE                               ~ "T1"
    ),
    Domain = case_when(
      str_detect(Measure, "^ED") ~ "ED visits",
      TRUE                       ~ "Inpatient stays"
    ),
    Group = case_when(
      is.na(Count) ~ NA_character_,
      Count == 0   ~ "0",
      Count >= 1   ~ "≥1"
    )
  ) %>%
  filter(!is.na(Group))

util_summary <- util_long %>%
  count(Domain, Timepoint, Group) %>%
  group_by(Domain, Timepoint) %>%
  mutate(
    Denominator = sum(n),
    Percent = round(100 * n / Denominator, 1),
    label = paste0(Percent, "%\n(n=", n, ")")
  ) %>%
  ungroup() %>%
  mutate(
    Group = factor(Group, levels = c("0", "≥1")),
    Timepoint = factor(Timepoint, levels = c("Baseline (last 90d)", "T1"))
  )

# =========================
# 6) Utilization plots
# =========================
p_ed_util <- util_summary %>%
  filter(Domain == "ED visits") %>%
  ggplot(aes(x = Group, y = Percent, fill = Timepoint)) +
  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
  geom_text(
    aes(label = label),
    position = position_dodge(width = 0.8),
    vjust = -0.3,
    size = 3.5
  ) +
  scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
  theme_bw() +
  labs(
    title = "ED Utilization: Baseline vs T1 (Cohort 1)",
    x = "",
    y = "% of Cohort 1",
    fill = ""
  ) +
  theme(plot.title = element_text(face = "bold", size = 14),
        legend.position = "top")

p_ip_util <- util_summary %>%
  filter(Domain == "Inpatient stays") %>%
  ggplot(aes(x = Group, y = Percent, fill = Timepoint)) +
  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
  geom_text(
    aes(label = label),
    position = position_dodge(width = 0.8),
    vjust = -0.3,
    size = 3.5
  ) +
  scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
  theme_bw() +
  labs(
    title = "Inpatient Utilization: Baseline vs T1 (Cohort 1)",
    x = "",
    y = "% of Cohort 1",
    fill = ""
  ) +
  theme(plot.title = element_text(face = "bold", size = 14),
        legend.position = "top")

p_ed_util / p_ip_util

# 1) Build paired baseline vs T1 utilization (numeric + binary)
df_util_change <- df_COHORT1 %>%
  transmute(
    id = row_number(),

    ED_base = clean_num(.data[[col_ed_base]]),
    ED_T1   = clean_num(.data[[col_ed_t1]]),

    IP_base = clean_num(.data[[col_ip_base]]),
    IP_T1   = clean_num(.data[[col_ip_t1]])
  ) %>%
  mutate(
    ED_base_bin = case_when(is.na(ED_base) ~ NA_character_, ED_base == 0 ~ "0", ED_base >= 1 ~ "≥1"),
    ED_T1_bin   = case_when(is.na(ED_T1)   ~ NA_character_, ED_T1   == 0 ~ "0", ED_T1   >= 1 ~ "≥1"),

    IP_base_bin = case_when(is.na(IP_base) ~ NA_character_, IP_base == 0 ~ "0", IP_base >= 1 ~ "≥1"),
    IP_T1_bin   = case_when(is.na(IP_T1)   ~ NA_character_, IP_T1   == 0 ~ "0", IP_T1   >= 1 ~ "≥1")
  ) %>%
  mutate(
    ED_change = case_when(
      is.na(ED_base_bin) | is.na(ED_T1_bin)            ~ "Missing",
      ED_base_bin == "≥1" & ED_T1_bin == "0"           ~ "Improved (≥1 → 0)",
      ED_base_bin == "0"  & ED_T1_bin == "≥1"          ~ "Worsened (0 → ≥1)",
      TRUE                                             ~ "No change"
    ),
    IP_change = case_when(
      is.na(IP_base_bin) | is.na(IP_T1_bin)            ~ "Missing",
      IP_base_bin == "≥1" & IP_T1_bin == "0"           ~ "Improved (≥1 → 0)",
      IP_base_bin == "0"  & IP_T1_bin == "≥1"          ~ "Worsened (0 → ≥1)",
      TRUE                                             ~ "No change"
    )
  )

# 2) Summary tables (counts + % among NON-missing pairs)
ed_change_tab <- df_util_change %>%
  filter(ED_change != "Missing") %>%
  count(ED_change, name = "Count") %>%
  mutate(
    Denominator = sum(Count),
    Percent = round(100 * Count / Denominator, 1)
  ) %>%
  arrange(match(ED_change, c("Improved (≥1 → 0)", "No change", "Worsened (0 → ≥1)")))

ip_change_tab <- df_util_change %>%
  filter(IP_change != "Missing") %>%
  count(IP_change, name = "Count") %>%
  mutate(
    Denominator = sum(Count),
    Percent = round(100 * Count / Denominator, 1)
  ) %>%
  arrange(match(IP_change, c("Improved (≥1 → 0)", "No change", "Worsened (0 → ≥1)")))

ed_change_tab
## # A tibble: 3 × 4
##   ED_change         Count Denominator Percent
##   <chr>             <int>       <int>   <dbl>
## 1 Improved (≥1 → 0)    21          60      35
## 2 No change            33          60      55
## 3 Worsened (0 → ≥1)     6          60      10
ip_change_tab
## # A tibble: 3 × 4
##   IP_change         Count Denominator Percent
##   <chr>             <int>       <int>   <dbl>
## 1 Improved (≥1 → 0)    21          60      35
## 2 No change            33          60      55
## 3 Worsened (0 → ≥1)     6          60      10
# 3) Plot data (stack ED + IP changes, percent within domain)
change_long <- bind_rows(
  df_util_change %>%
    transmute(Domain = "ED visits", Change = ED_change),
  df_util_change %>%
    transmute(Domain = "Inpatient stays", Change = IP_change)
) %>%
  filter(Change != "Missing") %>%
  count(Domain, Change, name = "n") %>%
  group_by(Domain) %>%
  mutate(
    Denominator = sum(n),
    Percent = round(100 * n / Denominator, 1),
    label = paste0(Percent, "%\n(n=", n, ")")
  ) %>%
  ungroup() %>%
  mutate(
    Change = factor(Change, levels = c("Improved (≥1 → 0)", "No change", "Worsened (0 → ≥1)")),
    Domain = factor(Domain, levels = c("ED visits", "Inpatient stays"))
  )

# 4) Change plots (one for ED, one for IP; percent scale)
p_ed_change <- change_long %>%
  filter(Domain == "ED visits") %>%
  ggplot(aes(x = Change, y = Percent)) +
  geom_col(fill = "#4E79A7") +
  geom_text(aes(label = label), vjust = -0.3, size = 3.5) +
  theme_bw() +
  labs(
    title = "ED Utilization Change: Baseline → T1 (Cohort 1)",
    x = "",
    y = "% of patients with both timepoints"
  ) +
  scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
  theme(plot.title = element_text(face = "bold", size = 14))

p_ip_change <- change_long %>%
  filter(Domain == "Inpatient stays") %>%
  ggplot(aes(x = Change, y = Percent)) +
  geom_col(fill = "#1F4E79") +
  geom_text(aes(label = label), vjust = -0.3, size = 3.5) +
  theme_bw() +
  labs(
    title = "Inpatient Utilization Change: Baseline → T1 (Cohort 1)",
    x = "",
    y = "% of patients with both timepoints"
  ) +
  scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
  theme(plot.title = element_text(face = "bold", size = 14))

p_ed_change

p_ip_change

# =========================
# 7) A1c availability + change tables (Cohort 1)
# =========================
df_a1c <- df_COHORT1 %>%
  mutate(
    A1c_baseline = clean_a1c(.data[[col_a1c_base]]),
    A1c_T1       = clean_a1c(.data[[col_a1c_t1]]),
    A1c_change   = A1c_T1 - A1c_baseline
  )

pdsa_a1c_availability <- tibble(
  Metric = c(
    "Cohort 1 enrolled patients",
    "Baseline HbA1c recorded",
    "T1 HbA1c recorded",
    "HbA1c change evaluable (baseline + T1)"
  ),
  Count = c(
    nrow(df_COHORT1),
    sum(!is.na(df_a1c$A1c_baseline)),
    sum(!is.na(df_a1c$A1c_T1)),
    sum(!is.na(df_a1c$A1c_change))
  ),
  Denominator = rep(nrow(df_COHORT1), 4)
) %>%
  mutate(Percent = round(100 * Count / Denominator, 1))

pdsa_a1c_availability
## # A tibble: 4 × 4
##   Metric                                 Count Denominator Percent
##   <chr>                                  <int>       <int>   <dbl>
## 1 Cohort 1 enrolled patients                60          60   100  
## 2 Baseline HbA1c recorded                   60          60   100  
## 3 T1 HbA1c recorded                         29          60    48.3
## 4 HbA1c change evaluable (baseline + T1)    29          60    48.3
chg_tab <- df_a1c %>%
  filter(!is.na(A1c_change)) %>%
  mutate(
    A1c_change_cat = case_when(
      A1c_change <= -1.0 ~ "Improved (≥1.0% reduction)",
      A1c_change >=  1.0 ~ "Worsened (≥1.0% increase)",
      TRUE               ~ "No meaningful change"
    )
  ) %>%
  count(A1c_change_cat, name = "Count") %>%
  mutate(
    Denominator = sum(Count),
    Percent = round(100 * Count / Denominator, 1)
  )

chg_tab
## # A tibble: 3 × 4
##   A1c_change_cat             Count Denominator Percent
##   <chr>                      <int>       <int>   <dbl>
## 1 Improved (≥1.0% reduction)    19          29    65.5
## 2 No meaningful change           9          29    31  
## 3 Worsened (≥1.0% increase)      1          29     3.4
# Optional: simple bar plot for A1c change distribution
a1c_plot_df <- chg_tab %>%
  mutate(
    A1c_change_cat = factor(
      A1c_change_cat,
      levels = c("Improved (≥1.0% reduction)", "No meaningful change", "Worsened (≥1.0% increase)")
    ),
    label = paste0(Percent, "%")
  )

ggplot(a1c_plot_df, aes(x = A1c_change_cat, y = Percent, fill = A1c_change_cat)) +
  geom_col(width = 0.7) +
  geom_text(aes(label = label), vjust = -0.3, size = 4) +
  theme_bw() +
  labs(
    title = "HbA1c Change from Baseline to T1 (Cohort 1)",
    x = "",
    y = "% of evaluable patients"
  ) +
  scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", size = 14),
    axis.text.x = element_text(size = 10)
  )