library(readxl)
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(tidyr)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(patchwork)
# =========================
# 1) Load + basic cleaning
# =========================
df <- read_excel("COHORT 1 with A1C and Hosp Util.xlsx", col_types = "text")
## New names:
## • `` -> `...50`
na_codes <- c("", "NA", "N/A", "na", "n/a", "Na", "NULL", "null")
# Clean only survey-style columns (T0/T1) + keep everything else as-is
survey_cols <- grep("^T[01]-", names(df), value = TRUE)
df <- df %>%
mutate(
across(
all_of(survey_cols),
~ {
x <- trimws(as.character(.x))
x[x %in% na_codes] <- NA_character_
x
}
)
)
# Identify question columns (if you need later)
t0_q_cols <- grep("^T0-", names(df), value = TRUE)
t1_q_cols <- grep("^T1-", names(df), value = TRUE)
# =========================
# 2) Helper cleaners
# =========================
clean_yesno <- function(x) {
x <- toupper(trimws(as.character(x)))
dplyr::case_when(
x %in% c("Y","YES","TRUE","1") ~ "Yes",
x %in% c("N","NO","FALSE","0") ~ "No",
TRUE ~ NA_character_
)
}
clean_cohort <- function(x) {
x <- toupper(trimws(as.character(x)))
dplyr::case_when(
x %in% c("1","COHORT 1","COHORT1","C1") ~ "Cohort 1",
x %in% c("2","COHORT 2","COHORT2","C2") ~ "Cohort 2",
TRUE ~ NA_character_
)
}
clean_num <- function(x) {
x <- trimws(as.character(x))
x[x %in% na_codes] <- NA
suppressWarnings(as.numeric(x))
}
clean_a1c <- function(x) {
x <- trimws(as.character(x))
x[x %in% na_codes] <- NA
x[x == "<4.0"] <- "4.0"
x[x == ">14.0"] <- "14.0"
x[x == ">20.0"] <- "20.0"
x <- gsub("%", "", x)
suppressWarnings(as.numeric(x))
}
# =========================
# 3) REQUIRED column names
# =========================
col_enrolled <- "Enrolled"
col_cohort <- "Cohort"
# Baseline utilization (last 90 days)
col_ed_base <- "ED Vis Last 90 Days"
col_ip_base <- "Hosp Last 90 Days"
# T1 utilization
col_ed_t1 <- "ED Utilization (T1)"
col_ip_t1 <- "IP Utilization (T1)"
# A1c columns
col_a1c_base <- "HbA1c Value (Last 3 Months)"
col_a1c_t1 <- "Hemoglobin A1c (T1)"
needed <- c(
col_enrolled, col_cohort,
col_ed_base, col_ip_base, col_ed_t1, col_ip_t1,
col_a1c_base, col_a1c_t1
)
missing <- setdiff(needed, names(df))
if (length(missing) > 0) {
stop("Missing required columns in this dataset: ", paste(missing, collapse = ", "))
}
# =========================
# 4) Define Cohort 1 cleanly
# =========================
df_status <- df %>%
mutate(
Enrolled_clean = clean_yesno(.data[[col_enrolled]]),
Cohort_clean = clean_cohort(.data[[col_cohort]])
) %>%
mutate(
Cohort_final = case_when(
Enrolled_clean == "Yes" & Cohort_clean == "Cohort 1" ~ "Cohort 1",
Enrolled_clean == "Yes" & Cohort_clean == "Cohort 2" ~ "Cohort 2",
TRUE ~ "Not enrolled"
)
)
df_COHORT1 <- df_status %>% filter(Cohort_final == "Cohort 1")
cat("\nCohort 1 N =", nrow(df_COHORT1), "\n")
##
## Cohort 1 N = 60
# =========================
# 5) Utilization dataset
# =========================
df_util <- df_COHORT1 %>%
transmute(
ED_base = clean_num(.data[[col_ed_base]]),
IP_base = clean_num(.data[[col_ip_base]]),
ED_T1 = clean_num(.data[[col_ed_t1]]),
IP_T1 = clean_num(.data[[col_ip_t1]])
)
# Binary groups (0 vs >=1) for simple PDSA reporting
util_long <- df_util %>%
mutate(id = row_number()) %>%
pivot_longer(
cols = c(ED_base, ED_T1, IP_base, IP_T1),
names_to = "Measure",
values_to = "Count"
) %>%
mutate(
Timepoint = case_when(
Measure %in% c("ED_base","IP_base") ~ "Baseline (last 90d)",
TRUE ~ "T1"
),
Domain = case_when(
str_detect(Measure, "^ED") ~ "ED visits",
TRUE ~ "Inpatient stays"
),
Group = case_when(
is.na(Count) ~ NA_character_,
Count == 0 ~ "0",
Count >= 1 ~ "≥1"
)
) %>%
filter(!is.na(Group))
util_summary <- util_long %>%
count(Domain, Timepoint, Group) %>%
group_by(Domain, Timepoint) %>%
mutate(
Denominator = sum(n),
Percent = round(100 * n / Denominator, 1),
label = paste0(Percent, "%\n(n=", n, ")")
) %>%
ungroup() %>%
mutate(
Group = factor(Group, levels = c("0", "≥1")),
Timepoint = factor(Timepoint, levels = c("Baseline (last 90d)", "T1"))
)
# =========================
# 6) Utilization plots
# =========================
p_ed_util <- util_summary %>%
filter(Domain == "ED visits") %>%
ggplot(aes(x = Group, y = Percent, fill = Timepoint)) +
geom_col(position = position_dodge(width = 0.8), width = 0.7) +
geom_text(
aes(label = label),
position = position_dodge(width = 0.8),
vjust = -0.3,
size = 3.5
) +
scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
theme_bw() +
labs(
title = "ED Utilization: Baseline vs T1 (Cohort 1)",
x = "",
y = "% of Cohort 1",
fill = ""
) +
theme(plot.title = element_text(face = "bold", size = 14),
legend.position = "top")
p_ip_util <- util_summary %>%
filter(Domain == "Inpatient stays") %>%
ggplot(aes(x = Group, y = Percent, fill = Timepoint)) +
geom_col(position = position_dodge(width = 0.8), width = 0.7) +
geom_text(
aes(label = label),
position = position_dodge(width = 0.8),
vjust = -0.3,
size = 3.5
) +
scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
theme_bw() +
labs(
title = "Inpatient Utilization: Baseline vs T1 (Cohort 1)",
x = "",
y = "% of Cohort 1",
fill = ""
) +
theme(plot.title = element_text(face = "bold", size = 14),
legend.position = "top")
p_ed_util / p_ip_util

# 1) Build paired baseline vs T1 utilization (numeric + binary)
df_util_change <- df_COHORT1 %>%
transmute(
id = row_number(),
ED_base = clean_num(.data[[col_ed_base]]),
ED_T1 = clean_num(.data[[col_ed_t1]]),
IP_base = clean_num(.data[[col_ip_base]]),
IP_T1 = clean_num(.data[[col_ip_t1]])
) %>%
mutate(
ED_base_bin = case_when(is.na(ED_base) ~ NA_character_, ED_base == 0 ~ "0", ED_base >= 1 ~ "≥1"),
ED_T1_bin = case_when(is.na(ED_T1) ~ NA_character_, ED_T1 == 0 ~ "0", ED_T1 >= 1 ~ "≥1"),
IP_base_bin = case_when(is.na(IP_base) ~ NA_character_, IP_base == 0 ~ "0", IP_base >= 1 ~ "≥1"),
IP_T1_bin = case_when(is.na(IP_T1) ~ NA_character_, IP_T1 == 0 ~ "0", IP_T1 >= 1 ~ "≥1")
) %>%
mutate(
ED_change = case_when(
is.na(ED_base_bin) | is.na(ED_T1_bin) ~ "Missing",
ED_base_bin == "≥1" & ED_T1_bin == "0" ~ "Improved (≥1 → 0)",
ED_base_bin == "0" & ED_T1_bin == "≥1" ~ "Worsened (0 → ≥1)",
TRUE ~ "No change"
),
IP_change = case_when(
is.na(IP_base_bin) | is.na(IP_T1_bin) ~ "Missing",
IP_base_bin == "≥1" & IP_T1_bin == "0" ~ "Improved (≥1 → 0)",
IP_base_bin == "0" & IP_T1_bin == "≥1" ~ "Worsened (0 → ≥1)",
TRUE ~ "No change"
)
)
# 2) Summary tables (counts + % among NON-missing pairs)
ed_change_tab <- df_util_change %>%
filter(ED_change != "Missing") %>%
count(ED_change, name = "Count") %>%
mutate(
Denominator = sum(Count),
Percent = round(100 * Count / Denominator, 1)
) %>%
arrange(match(ED_change, c("Improved (≥1 → 0)", "No change", "Worsened (0 → ≥1)")))
ip_change_tab <- df_util_change %>%
filter(IP_change != "Missing") %>%
count(IP_change, name = "Count") %>%
mutate(
Denominator = sum(Count),
Percent = round(100 * Count / Denominator, 1)
) %>%
arrange(match(IP_change, c("Improved (≥1 → 0)", "No change", "Worsened (0 → ≥1)")))
ed_change_tab
## # A tibble: 3 × 4
## ED_change Count Denominator Percent
## <chr> <int> <int> <dbl>
## 1 Improved (≥1 → 0) 21 60 35
## 2 No change 33 60 55
## 3 Worsened (0 → ≥1) 6 60 10
ip_change_tab
## # A tibble: 3 × 4
## IP_change Count Denominator Percent
## <chr> <int> <int> <dbl>
## 1 Improved (≥1 → 0) 21 60 35
## 2 No change 33 60 55
## 3 Worsened (0 → ≥1) 6 60 10
# 3) Plot data (stack ED + IP changes, percent within domain)
change_long <- bind_rows(
df_util_change %>%
transmute(Domain = "ED visits", Change = ED_change),
df_util_change %>%
transmute(Domain = "Inpatient stays", Change = IP_change)
) %>%
filter(Change != "Missing") %>%
count(Domain, Change, name = "n") %>%
group_by(Domain) %>%
mutate(
Denominator = sum(n),
Percent = round(100 * n / Denominator, 1),
label = paste0(Percent, "%\n(n=", n, ")")
) %>%
ungroup() %>%
mutate(
Change = factor(Change, levels = c("Improved (≥1 → 0)", "No change", "Worsened (0 → ≥1)")),
Domain = factor(Domain, levels = c("ED visits", "Inpatient stays"))
)
# 4) Change plots (one for ED, one for IP; percent scale)
p_ed_change <- change_long %>%
filter(Domain == "ED visits") %>%
ggplot(aes(x = Change, y = Percent)) +
geom_col(fill = "#4E79A7") +
geom_text(aes(label = label), vjust = -0.3, size = 3.5) +
theme_bw() +
labs(
title = "ED Utilization Change: Baseline → T1 (Cohort 1)",
x = "",
y = "% of patients with both timepoints"
) +
scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
theme(plot.title = element_text(face = "bold", size = 14))
p_ip_change <- change_long %>%
filter(Domain == "Inpatient stays") %>%
ggplot(aes(x = Change, y = Percent)) +
geom_col(fill = "#1F4E79") +
geom_text(aes(label = label), vjust = -0.3, size = 3.5) +
theme_bw() +
labs(
title = "Inpatient Utilization Change: Baseline → T1 (Cohort 1)",
x = "",
y = "% of patients with both timepoints"
) +
scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
theme(plot.title = element_text(face = "bold", size = 14))
p_ed_change

p_ip_change

# =========================
# 7) A1c availability + change tables (Cohort 1)
# =========================
df_a1c <- df_COHORT1 %>%
mutate(
A1c_baseline = clean_a1c(.data[[col_a1c_base]]),
A1c_T1 = clean_a1c(.data[[col_a1c_t1]]),
A1c_change = A1c_T1 - A1c_baseline
)
pdsa_a1c_availability <- tibble(
Metric = c(
"Cohort 1 enrolled patients",
"Baseline HbA1c recorded",
"T1 HbA1c recorded",
"HbA1c change evaluable (baseline + T1)"
),
Count = c(
nrow(df_COHORT1),
sum(!is.na(df_a1c$A1c_baseline)),
sum(!is.na(df_a1c$A1c_T1)),
sum(!is.na(df_a1c$A1c_change))
),
Denominator = rep(nrow(df_COHORT1), 4)
) %>%
mutate(Percent = round(100 * Count / Denominator, 1))
pdsa_a1c_availability
## # A tibble: 4 × 4
## Metric Count Denominator Percent
## <chr> <int> <int> <dbl>
## 1 Cohort 1 enrolled patients 60 60 100
## 2 Baseline HbA1c recorded 60 60 100
## 3 T1 HbA1c recorded 29 60 48.3
## 4 HbA1c change evaluable (baseline + T1) 29 60 48.3
chg_tab <- df_a1c %>%
filter(!is.na(A1c_change)) %>%
mutate(
A1c_change_cat = case_when(
A1c_change <= -1.0 ~ "Improved (≥1.0% reduction)",
A1c_change >= 1.0 ~ "Worsened (≥1.0% increase)",
TRUE ~ "No meaningful change"
)
) %>%
count(A1c_change_cat, name = "Count") %>%
mutate(
Denominator = sum(Count),
Percent = round(100 * Count / Denominator, 1)
)
chg_tab
## # A tibble: 3 × 4
## A1c_change_cat Count Denominator Percent
## <chr> <int> <int> <dbl>
## 1 Improved (≥1.0% reduction) 19 29 65.5
## 2 No meaningful change 9 29 31
## 3 Worsened (≥1.0% increase) 1 29 3.4
# Optional: simple bar plot for A1c change distribution
a1c_plot_df <- chg_tab %>%
mutate(
A1c_change_cat = factor(
A1c_change_cat,
levels = c("Improved (≥1.0% reduction)", "No meaningful change", "Worsened (≥1.0% increase)")
),
label = paste0(Percent, "%")
)
ggplot(a1c_plot_df, aes(x = A1c_change_cat, y = Percent, fill = A1c_change_cat)) +
geom_col(width = 0.7) +
geom_text(aes(label = label), vjust = -0.3, size = 4) +
theme_bw() +
labs(
title = "HbA1c Change from Baseline to T1 (Cohort 1)",
x = "",
y = "% of evaluable patients"
) +
scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
theme(
legend.position = "none",
plot.title = element_text(face = "bold", size = 14),
axis.text.x = element_text(size = 10)
)
