df <- read_excel("FIM Data 2-26-2026 (DI).xlsx",
col_types = "text") %>%
clean_names()
names(df)
## [1] "pre_enrollment_notes"
## [2] "cohort"
## [3] "enrolled"
## [4] "age"
## [5] "preferred_language"
## [6] "disch_date_time"
## [7] "cc"
## [8] "admission_diagnosis"
## [9] "admitting_provider"
## [10] "discharge_disposition"
## [11] "lace_readmission_score_score_column"
## [12] "unplanned_readmission_score"
## [13] "primary_cvg"
## [14] "hb_a1c_value_last_3_months"
## [15] "pt_portal_status"
## [16] "hosp_last_365_days"
## [17] "ed_last_365_days"
## [18] "ed_vis_last_90_days"
## [19] "hosp_last_90_days"
## [20] "pcp"
## [21] "race"
## [22] "ethnicity"
## [23] "legal_sex"
## [24] "sdoh_high_risk_domains"
## [25] "zip_code"
## [26] "prescreen_contacted_y_n"
## [27] "ps_in_the_past_12_months_were_you_ever_worried_your_food_would_run_out_before_you_had_money_to_buy_more"
## [28] "ps_in_the_past_12_months_did_the_food_you_could_afford_not_last_until_the_end_of_the_month_and_you_couldn_t_get_more"
## [29] "ps_in_the_last_month_did_anyone_in_your_household_have_to_skip_meals_due_to_lack_of_food"
## [30] "pre_screen_interested_in_program"
## [31] "t0_contact"
## [32] "screen_for_enrollment_interested_in_program"
## [33] "t0_are_you_likely_to_eat_the_food_as_part_of_your_normal_daily_routine"
## [34] "t0_in_the_past_week_how_often_did_you_check_your_blood_sugar"
## [35] "t0_are_you_confident_in_making_food_choices_that_help_control_your_blood_sugar"
## [36] "t0_are_you_confident_in_understanding_how_to_read_food_labels_and_nutrition_information"
## [37] "t0_were_your_sugar_levels_usually_in_your_target_range_last_week"
## [38] "t0_in_the_past_week_did_you_miss_any_doses_of_you_diabetes_medications"
## [39] "t0_since_your_discharge_from_the_hospital_have_you_had_an_appointment_with_a_healthcare_provider_for_your_diabetes"
## [40] "t1_contact_y_n"
## [41] "t1_notes"
## [42] "t1_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"
## [43] "t1_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"
## [44] "if_yes_why_and_how_much_food_was_thrown_away_44"
## [45] "t1_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"
## [46] "t1_in_the_past_week_how_often_did_you_check_your_blood_sugar"
## [47] "t1_were_your_sugar_levels_usually_in_your_target_range_last_week"
## [48] "t1_did_you_have_any_symptoms_of_high_or_low_blood_sugar_in_the_past_week"
## [49] "t1_in_the_past_week_did_you_miss_any_doses_of_you_diabetes_medications"
## [50] "do_you_have_access_to_your_medications_escalate_as_needed_50"
## [51] "t1_since_our_last_check_in_have_you_had_an_appointment_with_a_healthcare_provider_for_your_diabetes"
## [52] "t1_have_you_done_anything_in_the_past_week_to_help_manage_your_diabetes_like_walking_portion_control_or_drinking_more_water"
## [53] "t1_how_much_did_this_program_help_you_feel_more_in_control_of_your_diabetes"
## [54] "t1_did_the_nutrition_material_you_got_from_white_plains_hospital_help_you_make_healthier_food_choices"
## [55] "t1_how_confident_are_you_in_making_food_choices_that_help_control_your_blood_sugar"
## [56] "t1_how_confident_are_you_in_understanding_food_labels_or_nutrition_information"
## [57] "t1_how_confident_are_you_in_your_ability_to_prepare_healthy_meals_with_the_food_you_typically_have_at_home"
## [58] "in_one_to_two_sentences_can_you_tell_me_what_feedback_you_have_that_can_improve_this_program_for_others_58"
## [59] "t1_do_you_have_any_suggestions_or_feedback_to_improve_this_program_for_others"
## [60] "t2_contact_y_n"
## [61] "t2_notes"
## [62] "t2_in_the_past_6_months_were_you_ever_worried_your_food_would_run_out_before_you_had_money_to_buy_more"
## [63] "t2_in_the_past_6_months_did_the_food_you_could_afford_not_last_until_the_end_of_the_month_and_you_couldn_t_get_more"
## [64] "t2_in_the_last_month_did_anyone_in_your_household_have_to_skip_meals_due_to_lack_of_food"
## [65] "t2_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"
## [66] "t2_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"
## [67] "if_yes_why_and_how_much_food_was_thrown_away_67"
## [68] "t2_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"
## [69] "t2_in_the_past_week_how_often_did_you_check_your_blood_sugar"
## [70] "do_you_have_access_to_your_medications_escalate_as_needed_70"
## [71] "t2_were_your_sugar_levels_usually_in_your_target_range_last_week"
## [72] "t2_in_the_past_week_did_you_have_any_symptoms_of_high_or_low_blood_sugar"
## [73] "t2_in_the_past_week_did_you_miss_any_doses_of_your_diabetes_medication"
## [74] "t2_since_our_last_check_in_have_you_had_an_appointment_with_a_healthcare_provider_for_your_diabetes"
## [75] "t2_have_you_done_anything_in_the_past_week_to_help_manage_your_diabetes_like_walking_portion_control_drinking_more_water"
## [76] "t2_how_much_did_this_program_help_you_feel_more_in_control_of_your_diabetes"
## [77] "t2_did_the_nutrition_material_you_got_from_white_plains_hospital_help_you_make_healthier_food_choices"
## [78] "t2_how_confident_are_you_in_making_food_choices_that_help_control_your_blood_sugar"
## [79] "t2_how_confident_are_you_in_understanding_food_labels_or_nutrition_information"
## [80] "t2_how_confident_are_you_in_your_ability_to_prepare_healthy_meals_with_the_foods_you_typically_have_at_home"
## [81] "in_one_to_two_sentences_can_you_tell_me_what_feedback_you_have_that_can_improve_this_program_for_others_81"
## [82] "t2_do_you_have_any_suggestions_or_feedback_to_improve_this_program_for_others"
## [83] "hemoglobin_a1c_8_1_25_10_2_2025"
## [84] "hospital_utilization_8_1_25_10_2_25"
## [85] "a1c_change_8_1_25_10_2_25"
## [86] "a1c_under_8_8_1_25_10_2_25"
## [87] "hemoglobin_a1c_11_20_2_20_t1"
## [88] "ed_utilization_t1"
## [89] "ip_utilization_t1"
## [90] "a1c_change_t1"
## [91] "a1c_under_8_t1"
## [92] "additional_feedback"
## [93] "hemoglobin_a1c_t2"
## [94] "a1c_date_t2"
## [95] "ed_utilization_t2"
## [96] "hospital_utilization_t2"
## [97] "a1c_change_t2"
## [98] "a1c_under_8_t2"
## [99] "housing"
## [100] "housing_2"
## [101] "food"
## [102] "utilities"
## [103] "transport"
## [104] "finance"
## [105] "caregiver"
## [106] "legal"
## [107] "violence"
## [108] "employment"
## [109] "education"
## [110] "completed"
view(df)
# Confirm required columns exist
stopifnot("enrolled" %in% names(df))
stopifnot("cohort" %in% names(df))
df_clean <- df %>%
mutate(
enrolled_clean = str_to_upper(str_squish(as.character(enrolled))),
cohort_clean = str_squish(as.character(cohort)),
cohort_num = suppressWarnings(as.integer(str_extract(cohort_clean, "\\d+")))
)
df_status <- df_clean %>%
mutate(
enrolled_group = case_when(
enrolled_clean == "Y" ~ "Enrolled",
TRUE ~ "Not enrolled"
),
cohort_final = case_when(
enrolled_group == "Enrolled" & cohort_num == 1 ~ "Cohort 1",
enrolled_group == "Enrolled" & cohort_num == 2 ~ "Cohort 2",
enrolled_group == "Enrolled" & cohort_num == 3 ~ "Cohort 3",
TRUE ~ "Not enrolled"
)
)
cohort_counts <- df_status %>%
count(cohort_final, name = "N") %>%
arrange(cohort_final)
cohort_counts
## # A tibble: 4 × 2
## cohort_final N
## <chr> <int>
## 1 Cohort 1 60
## 2 Cohort 2 41
## 3 Cohort 3 35
## 4 Not enrolled 817
a1c_t1_col <- "a1c_change_t1"
ed_t1_col <- "ed_utilization_t1"
ip_t1_col <- "ip_utilization_t1"
a1c_t2_col <- "a1c_change_t2"
ed_t2_col <- "ed_utilization_t2"
hosp_t2_col <- "hospital_utilization_t2"
# ----------------------------
# 7) Safety checks: chart-review columns exist
# ----------------------------
chart_cols <- c(a1c_t1_col, ed_t1_col, ip_t1_col, a1c_t2_col, ed_t2_col, hosp_t2_col)
missing_cols <- setdiff(chart_cols, names(df_status))
if (length(missing_cols) > 0) {
stop(
"These chart-review columns are missing from df_status:\n - ",
paste(missing_cols, collapse = "\n - "),
"\n\nRun: names(df_status) and update the *_col variables to match."
)
}
# ----------------------------
# 8) Cohort subsets
# ----------------------------
df_c1 <- df_status %>% filter(cohort_final == "Cohort 1")
df_c2 <- df_status %>% filter(cohort_final == "Cohort 2")
df_c3 <- df_status %>% filter(cohort_final == "Cohort 3")
# ----------------------------
# 9) Completion helpers
# - "nonblank" means not NA and not "" and not "NA"/"N/A"/"NULL"
# ----------------------------
is_nonblank <- function(x) {
x_chr <- as.character(x)
x_std <- str_to_upper(str_squish(x_chr))
!(is.na(x_chr) | x_std == "" | x_std %in% c("NA", "N/A", "NULL"))
}
pct_complete <- function(df, column) {
total <- nrow(df)
if (total == 0) return("-")
complete_n <- sum(is_nonblank(df[[column]]))
pct <- round(100 * complete_n / total, 1)
paste0(complete_n, "/", total, " (", pct, "%)")
}
pct_complete_all_required <- function(df, cols_required) {
total <- nrow(df)
if (total == 0) return("-")
complete_n <- sum(apply(df[, cols_required, drop = FALSE], 1, function(row) {
all(is_nonblank(row))
}))
pct <- round(100 * complete_n / total, 1)
paste0(complete_n, "/", total, " (", pct, "%)")
}
# ----------------------------
# 10) Basic cohort size table
# ----------------------------
cohort_size_table <- tibble(
Cohort = c("Cohort 1", "Cohort 2", "Cohort 3"),
N = c(nrow(df_c1), nrow(df_c2), nrow(df_c3))
)
cohort_size_table
## # A tibble: 3 × 2
## Cohort N
## <chr> <int>
## 1 Cohort 1 60
## 2 Cohort 2 41
## 3 Cohort 3 35
# ----------------------------
# CHART REVIEW
# ----------------------------
c1_required <- c(a1c_t1_col, ed_t1_col, ip_t1_col, a1c_t2_col, ed_t2_col, hosp_t2_col)
c2_required <- c(a1c_t1_col, ed_t1_col, ip_t1_col)
chart_review_table <- tibble(
Cohort = c("Cohort 1", "Cohort 2", "Cohort 3"),
N = c(nrow(df_c1), nrow(df_c2), nrow(df_c3)),
# Field-level completeness
`A1c Change (T1)` = c(pct_complete(df_c1, a1c_t1_col), pct_complete(df_c2, a1c_t1_col), "-"),
`ED Utilization (T1)` = c(pct_complete(df_c1, ed_t1_col), pct_complete(df_c2, ed_t1_col), "-"),
`IP Utilization (T1)` = c(pct_complete(df_c1, ip_t1_col), pct_complete(df_c2, ip_t1_col), "-"),
`A1c Change (T2)` = c(pct_complete(df_c1, a1c_t2_col), "-", "-"),
`ED Utilization (T2)` = c(pct_complete(df_c1, ed_t2_col), "-", "-"),
`Hospital Utilization (T2)` = c(pct_complete(df_c1, hosp_t2_col), "-", "-"),
# Cohort-level "complete chart review"
`Chart review complete (per cohort rules)` = c(
pct_complete_all_required(df_c1, c1_required),
pct_complete_all_required(df_c2, c2_required),
"-"
)
)
chart_review_table
## # A tibble: 3 × 9
## Cohort N `A1c Change (T1)` `ED Utilization (T1)` `IP Utilization (T1)`
## <chr> <int> <chr> <chr> <chr>
## 1 Cohort 1 60 29/60 (48.3%) 60/60 (100%) 60/60 (100%)
## 2 Cohort 2 41 23/41 (56.1%) 41/41 (100%) 41/41 (100%)
## 3 Cohort 3 35 - - -
## # ℹ 4 more variables: `A1c Change (T2)` <chr>, `ED Utilization (T2)` <chr>,
## # `Hospital Utilization (T2)` <chr>,
## # `Chart review complete (per cohort rules)` <chr>
# -------------------------
# A1c Change (T1): Cohort 1 + Cohort 2
# -------------------------
a1c_t1 <- df_status %>%
filter(cohort_final %in% c("Cohort 1", "Cohort 2")) %>%
transmute(
delta = suppressWarnings(as.numeric(str_replace_all(str_squish(as.character(.data[[a1c_t1_col]])), "[^0-9.\\-]", "")))
) %>%
filter(!is.na(delta)) %>%
mutate(
Category = case_when(
delta <= -1.0 ~ "Improved (≥1.0% ↓)",
delta >= 1.0 ~ "Worsened (≥1.0% ↑)",
TRUE ~ "No meaningful change (<1.0%)"
),
Category = factor(Category, levels = c("Improved (≥1.0% ↓)", "No meaningful change (<1.0%)", "Worsened (≥1.0% ↑)"))
)
n_t1 <- nrow(a1c_t1)
a1c_t1_tab <- a1c_t1 %>%
count(Category, name = "n") %>%
mutate(
pct = round(100 * n / sum(n), 1),
label = paste0(pct, "%\n(n=", n, ")")
)
# ---- small helper: parse numeric safely ----
parse_delta <- function(x) {
suppressWarnings(as.numeric(str_replace_all(str_squish(as.character(x)), "[^0-9.\\-]", "")))
}
# ---- style settings (you asked for color) ----
col_improved <- "#2E7D32" # green
col_nochange <- "#757575" # gray
col_worsened <- "#C62828" # red
make_a1c_plots <- function(df_in, delta_col, title_text) {
d <- df_in %>%
transmute(delta = parse_delta(.data[[delta_col]])) %>%
filter(!is.na(delta)) %>%
mutate(
Category = case_when(
delta <= -1.0 ~ "Improved (≥1.0%)",
delta >= 1.0 ~ "Worsened (≥1.0%)",
TRUE ~ "No meaningful change"
),
Category = factor(Category, levels = c("Improved (≥1.0%)", "No meaningful change", "Worsened (≥1.0%)"))
)
n <- nrow(d)
tab <- d %>%
count(Category, name = "n") %>%
mutate(
pct = round(100 * n / sum(n), 1),
label = paste0(pct, "% (n=", n, ")")
)
# 1) Colored category bar chart
p_bar <- ggplot(tab, aes(x = Category, y = pct, fill = Category)) +
geom_col(width = 0.72) +
geom_text(aes(label = label), vjust = -0.35, size = 4) +
scale_fill_manual(values = c(
"Improved (≥1.0%)" = col_improved,
"No meaningful change" = col_nochange,
"Worsened (≥1.0%)" = col_worsened
)) +
scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
labs(title = paste0(title_text, " (n = ", n, ")"), x = NULL, y = "% of cohort") +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 16),
axis.text.x = element_text(size = 12),
legend.position = "none"
)
# 2) Replace histogram with: jitter + boxplot + threshold bands/lines
p_dist <- ggplot(d, aes(x = "", y = delta)) +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = -1, ymax = 1, alpha = 0.10, fill = col_nochange) +
geom_hline(yintercept = c(-1, 1), linetype = "dashed", linewidth = 0.7, color = "black") +
geom_boxplot(width = 0.22, outlier.shape = NA, alpha = 0.35) +
geom_jitter(width = 0.12, height = 0, alpha = 0.55, size = 1.6) +
coord_flip() +
labs(
title = "A1c change distribution (each dot = 1 patient)",
x = NULL,
y = "A1c change (percentage points)"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 14),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()
)
list(tab = tab, bar = p_bar, dist = p_dist)
}
# =========================
# T1: Cohort 1 + Cohort 2
# =========================
df_a1c_t1 <- df_status %>% filter(cohort_final %in% c("Cohort 1", "Cohort 2"))
out_t1 <- make_a1c_plots(df_a1c_t1, a1c_t1_col, "A1C Change (T1): Cohort 1 + Cohort 2")
out_t1$tab
## # A tibble: 3 × 4
## Category n pct label
## <fct> <int> <dbl> <chr>
## 1 Improved (≥1.0%) 32 62.7 62.7% (n=32)
## 2 No meaningful change 17 33.3 33.3% (n=17)
## 3 Worsened (≥1.0%) 2 3.9 3.9% (n=2)
out_t1$bar

# =========================
# T2: Cohort 1 only
# =========================
df_a1c_t2 <- df_status %>% filter(cohort_final == "Cohort 1")
out_t2 <- make_a1c_plots(df_a1c_t2, a1c_t2_col, "A1C Change (T2): Cohort 1")
out_t2$tab
## # A tibble: 3 × 4
## Category n pct label
## <fct> <int> <dbl> <chr>
## 1 Improved (≥1.0%) 19 63.3 63.3% (n=19)
## 2 No meaningful change 9 30 30% (n=9)
## 3 Worsened (≥1.0%) 2 6.7 6.7% (n=2)
out_t2$bar

# -------------------------
# Column names you specified
# -------------------------
col_a1c_all <- "hb_a1c_value_last_3_months"
col_a1c_t1 <- "hemoglobin_a1c_11_20_2_20_t1"
col_a1c_t2 <- "hemoglobin_a1c_t2"
need_cols <- c(col_a1c_all, col_a1c_t1, col_a1c_t2)
missing <- setdiff(need_cols, names(df_status))
if (length(missing) > 0) {
stop("Missing columns:\n - ", paste(missing, collapse = "\n - "),
"\n\nCheck names(df_status) and fix the col_a1c_* variables above.")
}
# -------------------------
# Clean HbA1c values → numeric
# -------------------------
clean_a1c <- function(x) {
x <- str_squish(as.character(x))
x[x %in% c("", "NA", "N/A", "na", "n/a", "NULL", "null")] <- NA_character_
x[x == "<4.0"] <- "4.0"
x[x == ">14.0"] <- "14.0"
x[x == ">20.0"] <- "20.0"
x <- gsub("%", "", x)
suppressWarnings(as.numeric(x))
}
# -------------------------
# Build long dataset (3 panels)
# -------------------------
a1c_long <- df_status %>%
transmute(
id = row_number(),
A1c_All = clean_a1c(.data[[col_a1c_all]]),
A1c_T1 = clean_a1c(.data[[col_a1c_t1]]),
A1c_T2 = clean_a1c(.data[[col_a1c_t2]])
) %>%
pivot_longer(cols = c(A1c_All, A1c_T1, A1c_T2),
names_to = "Timepoint",
values_to = "HbA1c") %>%
filter(!is.na(HbA1c)) %>%
mutate(
Timepoint = recode(Timepoint,
A1c_All = "All patients (chart review HbA1c)",
A1c_T1 = "T1 HbA1c",
A1c_T2 = "T2 HbA1c"),
Timepoint = factor(Timepoint,
levels = c("All patients (chart review HbA1c)", "T1 HbA1c", "T2 HbA1c")),
Hb_bin = floor(HbA1c),
Hb_mid = Hb_bin + 0.5
)
# -------------------------
# Summarize into 1.0%-wide bins and label percentages
# -------------------------
a1c_hist <- a1c_long %>%
count(Timepoint, Hb_bin, Hb_mid, name = "n") %>%
group_by(Timepoint) %>%
mutate(
pct = round(100 * n / sum(n), 1),
label = ifelse(pct > 0, paste0(pct, "%"), ""),
N = sum(n)
) %>%
ungroup()
# For facet labels with n
facet_labs <- setNames(
paste0(levels(a1c_hist$Timepoint), " (n = ", tapply(a1c_long$HbA1c, a1c_long$Timepoint, length), ")"),
levels(a1c_hist$Timepoint)
)
# Determine x-range
min_bin <- min(a1c_hist$Hb_bin, na.rm = TRUE)
max_bin <- max(a1c_hist$Hb_bin, na.rm = TRUE)
# -------------------------
# Plot (3 stacked panels)
# -------------------------
ggplot(a1c_hist, aes(x = Hb_mid, y = pct)) +
geom_col(width = 0.9, fill = "#4E79A7", color = "white") +
geom_text(aes(label = label), vjust = -0.35, size = 3.4) +
facet_wrap(
~ Timepoint,
ncol = 1,
labeller = labeller(Timepoint = facet_labs)
) +
scale_x_continuous(
breaks = seq(min_bin, max_bin, by = 1),
minor_breaks = NULL
) +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
coord_cartesian(ylim = c(0, max(a1c_hist$pct, na.rm = TRUE) + 10)) +
theme_bw() +
labs(
title = "HbA1c Distributions (All vs T1 vs T2)",
x = "HbA1c (%)",
y = "% within timepoint"
) +
theme(
plot.title = element_text(face = "bold", size = 14),
strip.text = element_text(face = "bold", size = 12),
axis.text = element_text(size = 11)
)

# -------------------------
# 1) COLUMN NAMES (EDIT THESE 5 to match names(df_status))
# -------------------------
col_ed_base <- "ed_vis_last_90_days"
col_ed_t1 <- "ed_utilization_t1"
col_ed_t2 <- "ed_utilization_t2"
col_ip_base <- "hosp_last_90_days" # <-- replace with your true IP baseline column (NOT hosp)
col_ip_t1 <- "ip_utilization_t1"
col_ip_t2 <- "hospital_utilization_t2"
# -------------------------
# 2) Helpers
# -------------------------
parse_count <- function(x) {
x <- as.character(x)
x <- str_squish(x)
x[x == ""] <- NA_character_
x[str_to_upper(x) %in% c("NA","N/A","NULL")] <- NA_character_
suppressWarnings(as.numeric(str_replace_all(x, "[^0-9\\-\\.]", "")))
}
make_bucket <- function(x, cap = 5) {
bx <- ifelse(x >= cap, cap, as.integer(x))
ifelse(bx == cap, paste0(cap, "+"), as.character(bx))
}
# -------------------------
# 3) Safety checks
# -------------------------
need_cols <- c(col_ed_base, col_ed_t1, col_ed_t2,
col_ip_base, col_ip_t1, col_ip_t2,
"enrolled_group")
missing <- setdiff(need_cols, names(df_status))
if (length(missing) > 0) {
stop("Missing columns:\n - ", paste(missing, collapse = "\n - "),
"\n\nFix the col_* names above using names(df_status).")
}
# -------------------------
# 4) Colors (simple + clear)
# -------------------------
col_less <- "#2E7D32" # green
col_same <- "#757575" # gray
col_more <- "#C62828" # red
col_zero <- "#546E7A" # blue-gray
col_onep <- "#8E24AA" # purple
make_dist_df <- function(df_status, base_col, t1_col, t2_col, cap = 5, domain_label = "ED") {
d <- df_status %>%
transmute(
group = if_else(enrolled_group == "Enrolled", "Enrolled", "All patients"),
base = parse_count(.data[[base_col]]),
t1 = parse_count(.data[[t1_col]]),
t2 = parse_count(.data[[t2_col]])
) %>%
pivot_longer(cols = c(base, t1, t2), names_to = "timepoint", values_to = "count") %>%
filter(!is.na(count)) %>%
mutate(
timepoint = recode(timepoint,
base = "Baseline (last 90d)",
t1 = "T1",
t2 = "T2"),
timepoint = factor(timepoint, levels = c("Baseline (last 90d)", "T1", "T2")),
bucket = make_bucket(count, cap = cap),
bucket = factor(bucket, levels = c(as.character(0:(cap-1)), paste0(cap, "+"))),
group = factor(group, levels = c("All patients", "Enrolled")),
domain = domain_label
)
d %>%
count(domain, group, timepoint, bucket, name = "n") %>%
tidyr::complete(domain, group, timepoint, bucket, fill = list(n = 0)) %>% # ensures missing buckets show as 0
group_by(domain, group, timepoint) %>%
mutate(
denom = sum(n),
pct = ifelse(denom > 0, round(100 * n / denom, 1), 0),
label = ifelse(n > 0, paste0(pct, "%"), "")
) %>%
ungroup()
}
plot_dist <- function(tab, title_text, xlab) {
ggplot(tab, aes(x = bucket, y = pct)) +
geom_col(width = 0.8, fill = "#2E6F9E") +
geom_text(aes(label = label), vjust = -0.25, size = 3.6) +
facet_grid(group ~ timepoint) +
scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
theme_bw() +
labs(title = title_text, x = xlab, y = "% of patients within group") +
theme(
plot.title = element_text(face = "bold", size = 14),
strip.background = element_rect(fill = "grey90"),
strip.text = element_text(face = "bold")
)
}
# -------------------------
# 5) ED distribution plot (cap at 5+)
# -------------------------
ed_dist <- make_dist_df(df_status, col_ed_base, col_ed_t1, col_ed_t2, cap = 5, domain_label = "ED")
plot_dist(ed_dist, "ED Utilization Distribution: All patients vs Enrolled", "ED visits")

# -------------------------
# 6) IP distribution plot (cap at 3+)
# -------------------------
ip_dist <- make_dist_df(df_status, col_ip_base, col_ip_t1, col_ip_t2, cap = 3, domain_label = "IP")
plot_dist(ip_dist, "Inpatient Utilization Distribution: All patients vs Enrolled", "Inpatient stays")

# =========================
# 7) CHANGE PLOTS: Baseline → T1 (Enrolled only)
# X-axis: Less visits / No change / More visits
# =========================
# Build enrolled paired cohort (must have baseline + T1 for each domain)
df_util_pairs <- df_status %>%
filter(enrolled_group == "Enrolled") %>%
transmute(
ED_base = parse_count(.data[[col_ed_base]]),
ED_T1 = parse_count(.data[[col_ed_t1]]),
IP_base = parse_count(.data[[col_ip_base]]),
IP_T1 = parse_count(.data[[col_ip_t1]])
) %>%
filter(!is.na(ED_base) & !is.na(ED_T1),
!is.na(IP_base) & !is.na(IP_T1))
n_pairs <- nrow(df_util_pairs)
make_change_tab <- function(df, base_var, t1_var) {
df %>%
transmute(
Change = case_when(
.data[[t1_var]] < .data[[base_var]] ~ "Less visits",
.data[[t1_var]] > .data[[base_var]] ~ "More visits",
TRUE ~ "No change"
)
) %>%
count(Change, name = "n") %>%
mutate(
pct = round(100 * n / sum(n), 1),
label = paste0(pct, "%\n(n=", n, ")"),
Change = factor(Change, levels = c("Less visits", "No change", "More visits"))
)
}
plot_change_bar <- function(tab, title_text) {
ggplot(tab, aes(x = Change, y = pct, fill = Change)) +
geom_col(width = 0.72) +
geom_text(aes(label = label), vjust = -0.35, size = 4) +
scale_fill_manual(values = c(
"Less visits" = col_less,
"No change" = col_same,
"More visits" = col_more
)) +
scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
labs(
title = title_text,
x = NULL,
y = "% of enrolled cohort"
) +
theme_bw() +
theme(
plot.title = element_text(face = "bold", size = 14),
legend.position = "none"
)
}
# ED change
ed_change_tab <- make_change_tab(df_util_pairs, "ED_base", "ED_T1")
p_ed_change <- plot_change_bar(
ed_change_tab,
paste0("ED Utilization Change: Baseline \u2192 T1 (n = ", n_pairs, ")")
)
# IP change
ip_change_tab <- make_change_tab(df_util_pairs, "IP_base", "IP_T1")
p_ip_change <- plot_change_bar(
ip_change_tab,
paste0("IP Utilization Change: Baseline \u2192 T1 (n = ", n_pairs, ")")
)
# show tables if you want
ed_change_tab
## # A tibble: 3 × 4
## Change n pct label
## <fct> <int> <dbl> <chr>
## 1 Less visits 58 57.4 "57.4%\n(n=58)"
## 2 More visits 9 8.9 "8.9%\n(n=9)"
## 3 No change 34 33.7 "33.7%\n(n=34)"
ip_change_tab
## # A tibble: 3 × 4
## Change n pct label
## <fct> <int> <dbl> <chr>
## 1 Less visits 62 61.4 "61.4%\n(n=62)"
## 2 More visits 6 5.9 "5.9%\n(n=6)"
## 3 No change 33 32.7 "32.7%\n(n=33)"
# stacked like your screenshot
p_ed_change / p_ip_change

# =========================
# DEMOGRAPHIC TABLE
# Keeps original combined race table
# Also creates an expanded race table that splits multi-race responses
# =========================
race_col <- "race"
eth_col <- "ethnicity"
stopifnot(race_col %in% names(df_status))
stopifnot(eth_col %in% names(df_status))
canon_multiselect <- function(x) {
x <- as.character(x)
x <- str_trim(x)
x[x == ""] <- NA_character_
sapply(x, function(val) {
if (is.na(val)) return(NA_character_)
parts <- unlist(str_split(val, "\\r?\\n|\\s*[;,]\\s*"))
parts <- str_trim(parts)
parts <- parts[parts != ""]
if (length(parts) == 0) return(NA_character_)
paste(sort(unique(parts)), collapse = "; ")
}, USE.NAMES = FALSE)
}
race_map <- c(
"R1" = "American Indian / Alaska Native",
"R2" = "Asian",
"R3" = "Black or African-American",
"R4" = "Native Hawaiian / Pacific Islander",
"R5" = "White",
"R9" = "Other",
"S1" = "Patient Declined",
"S2" = "Patient Unavailable",
"S3" = "Not Applicable / Unknown"
)
clean_race_value <- function(x_vec) {
x_vec <- canon_multiselect(x_vec)
sapply(x_vec, function(s) {
if (is.na(s)) return(NA_character_)
items <- str_split(s, ";\\s*")[[1]]
items <- str_trim(items)
recoded <- vapply(items, function(it) {
code <- str_extract(it, "^[A-Za-z]\\d+")
if (!is.na(code) && code %in% names(race_map)) race_map[[code]] else it
}, character(1))
paste(sort(unique(recoded)), collapse = "; ")
}, USE.NAMES = FALSE)
}
clean_eth_value <- function(x_vec) {
x_vec <- canon_multiselect(x_vec)
sapply(x_vec, function(s) {
if (is.na(s)) return(NA_character_)
items <- str_split(s, ";\\s*")[[1]]
items <- str_trim(items)
recoded <- dplyr::case_when(
str_detect(items, "E1") ~ "Spanish / Hispanic / Latino",
str_detect(items, "E2") ~ "Not Spanish / Hispanic / Latino",
str_detect(items, "S1") ~ "Patient Declined",
str_detect(items, "S2") ~ "Patient Unavailable",
TRUE ~ NA_character_
)
recoded <- recoded[!is.na(recoded)]
if (length(recoded) == 0) return(NA_character_)
paste(sort(unique(recoded)), collapse = "; ")
}, USE.NAMES = FALSE)
}
df_status <- df_status %>%
mutate(
race_clean = clean_race_value(.data[[race_col]]),
ethnicity_clean = clean_eth_value(.data[[eth_col]])
)
fmt_np <- function(n, denom) {
if (is.na(n) || denom == 0) return("-")
paste0(n, " (", sprintf("%.1f", 100 * n / denom), "%)")
}
make_demo_table <- function(data, variable, label) {
d <- data %>%
mutate(Value = canon_multiselect(.data[[variable]])) %>%
filter(!is.na(Value))
denom_all <- nrow(d)
denom_not <- nrow(filter(d, enrolled_group == "Not enrolled"))
denom_enr <- nrow(filter(d, enrolled_group == "Enrolled"))
denom_c1 <- nrow(filter(d, cohort_final == "Cohort 1"))
denom_c2 <- nrow(filter(d, cohort_final == "Cohort 2"))
denom_c3 <- nrow(filter(d, cohort_final == "Cohort 3"))
all_ct <- d %>% count(Value, name = "n_all")
not_ct <- d %>% filter(enrolled_group == "Not enrolled") %>% count(Value, name = "n_not")
enr_ct <- d %>% filter(enrolled_group == "Enrolled") %>% count(Value, name = "n_enr")
c1_ct <- d %>% filter(cohort_final == "Cohort 1") %>% count(Value, name = "n_c1")
c2_ct <- d %>% filter(cohort_final == "Cohort 2") %>% count(Value, name = "n_c2")
c3_ct <- d %>% filter(cohort_final == "Cohort 3") %>% count(Value, name = "n_c3")
all_ct %>%
full_join(not_ct, by = "Value") %>%
full_join(enr_ct, by = "Value") %>%
full_join(c1_ct, by = "Value") %>%
full_join(c2_ct, by = "Value") %>%
full_join(c3_ct, by = "Value") %>%
mutate(
Category = label,
`All Patients` = mapply(fmt_np, n_all, denom_all),
`Not enrolled` = mapply(fmt_np, n_not, denom_not),
`Enrolled` = mapply(fmt_np, n_enr, denom_enr),
`Cohort 1` = mapply(fmt_np, n_c1, denom_c1),
`Cohort 2` = mapply(fmt_np, n_c2, denom_c2),
`Cohort 3` = mapply(fmt_np, n_c3, denom_c3)
) %>%
select(Category, Value, `All Patients`, `Not enrolled`, `Enrolled`, `Cohort 1`, `Cohort 2`, `Cohort 3`) %>%
arrange(Category, Value)
}
make_demo_table_expanded <- function(data, variable, label) {
d <- data %>%
filter(!is.na(.data[[variable]])) %>%
mutate(
Value = str_split(.data[[variable]], ";\\s*")
) %>%
tidyr::unnest(Value) %>%
mutate(
Value = str_trim(as.character(Value)),
Value = na_if(Value, "")
) %>%
filter(!is.na(Value))
denom_all <- nrow(filter(data, !is.na(.data[[variable]])))
denom_not <- nrow(filter(data, !is.na(.data[[variable]]), enrolled_group == "Not enrolled"))
denom_enr <- nrow(filter(data, !is.na(.data[[variable]]), enrolled_group == "Enrolled"))
denom_c1 <- nrow(filter(data, !is.na(.data[[variable]]), cohort_final == "Cohort 1"))
denom_c2 <- nrow(filter(data, !is.na(.data[[variable]]), cohort_final == "Cohort 2"))
denom_c3 <- nrow(filter(data, !is.na(.data[[variable]]), cohort_final == "Cohort 3"))
all_ct <- d %>% count(Value, name = "n_all")
not_ct <- d %>% filter(enrolled_group == "Not enrolled") %>% count(Value, name = "n_not")
enr_ct <- d %>% filter(enrolled_group == "Enrolled") %>% count(Value, name = "n_enr")
c1_ct <- d %>% filter(cohort_final == "Cohort 1") %>% count(Value, name = "n_c1")
c2_ct <- d %>% filter(cohort_final == "Cohort 2") %>% count(Value, name = "n_c2")
c3_ct <- d %>% filter(cohort_final == "Cohort 3") %>% count(Value, name = "n_c3")
all_ct %>%
full_join(not_ct, by = "Value") %>%
full_join(enr_ct, by = "Value") %>%
full_join(c1_ct, by = "Value") %>%
full_join(c2_ct, by = "Value") %>%
full_join(c3_ct, by = "Value") %>%
mutate(
Category = label,
`All Patients` = mapply(fmt_np, n_all, denom_all),
`Not enrolled` = mapply(fmt_np, n_not, denom_not),
`Enrolled` = mapply(fmt_np, n_enr, denom_enr),
`Cohort 1` = mapply(fmt_np, n_c1, denom_c1),
`Cohort 2` = mapply(fmt_np, n_c2, denom_c2),
`Cohort 3` = mapply(fmt_np, n_c3, denom_c3)
) %>%
select(Category, Value, `All Patients`, `Not enrolled`, `Enrolled`, `Cohort 1`, `Cohort 2`, `Cohort 3`) %>%
arrange(Category, Value)
}
race_table_combined <- make_demo_table(df_status, "race_clean", "Race")
race_table_expanded <- make_demo_table_expanded(df_status, "race_clean", "Race")
ethnicity_table <- make_demo_table(df_status, "ethnicity_clean", "Ethnicity")
demographic_table_old <- bind_rows(race_table_combined, ethnicity_table)
demographic_table <- bind_rows(race_table_expanded, ethnicity_table)
demographic_table
## # A tibble: 12 × 8
## Category Value `All Patients` `Not enrolled` Enrolled `Cohort 1` `Cohort 2`
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Race Ameri… 4 (0.4%) 3 (0.4%) 1 (0.7%) 1 (1.7%) -
## 2 Race Asian 31 (3.3%) 29 (3.5%) 2 (1.5%) - -
## 3 Race Black… 252 (26.4%) 195 (23.9%) 57 (41.… 25 (41.7%) 15 (36.6%)
## 4 Race Nativ… 3 (0.3%) 2 (0.2%) 1 (0.7%) 1 (1.7%) -
## 5 Race Other 285 (29.9%) 233 (28.5%) 52 (38.… 24 (40.0%) 18 (43.9%)
## 6 Race Patie… 38 (4.0%) 36 (4.4%) 2 (1.5%) - 2 (4.9%)
## 7 Race Patie… 2 (0.2%) 2 (0.2%) - - -
## 8 Race White 404 (42.4%) 374 (45.8%) 30 (22.… 14 (23.3%) 9 (22.0%)
## 9 Ethnicity Not S… 639 (67.2%) 556 (68.2%) 83 (61.… 36 (60.0%) 22 (53.7%)
## 10 Ethnicity Patie… 34 (3.6%) 33 (4.0%) 1 (0.7%) - 1 (2.4%)
## 11 Ethnicity Patie… 2 (0.2%) 2 (0.2%) - - -
## 12 Ethnicity Spani… 276 (29.0%) 224 (27.5%) 52 (38.… 24 (40.0%) 18 (43.9%)
## # ℹ 1 more variable: `Cohort 3` <chr>
demographic_table_old
## # A tibble: 23 × 8
## Category Value `All Patients` `Not enrolled` Enrolled `Cohort 1` `Cohort 2`
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Race Americ… 4 (0.4%) 3 (0.4%) 1 (0.7%) 1 (1.7%) -
## 2 Race Asian 29 (3.0%) 27 (3.3%) 2 (1.5%) - -
## 3 Race Asian;… 1 (0.1%) 1 (0.1%) - - -
## 4 Race Asian;… 1 (0.1%) 1 (0.1%) - - -
## 5 Race Black … 234 (24.6%) 180 (22.0%) 54 (39.… 23 (38.3%) 14 (34.1%)
## 6 Race Black … 1 (0.1%) - 1 (0.7%) 1 (1.7%) -
## 7 Race Black … 12 (1.3%) 11 (1.3%) 1 (0.7%) 1 (1.7%) -
## 8 Race Black … 3 (0.3%) 2 (0.2%) 1 (0.7%) - 1 (2.4%)
## 9 Race Black … 2 (0.2%) 2 (0.2%) - - -
## 10 Race Native… 2 (0.2%) 2 (0.2%) - - -
## # ℹ 13 more rows
## # ℹ 1 more variable: `Cohort 3` <chr>
# -------------------------
# 0) COLUMN NAMES (edit if needed)
# -------------------------
race_col <- "race"
sex_col <- "legal_sex"
eth_col <- "ethnicity"
needed_demo_cols <- c("enrolled_group", race_col, sex_col, eth_col)
missing_demo_cols <- setdiff(needed_demo_cols, names(df_status))
if (length(missing_demo_cols) > 0) {
stop(
"Missing demographic columns:\n - ",
paste(missing_demo_cols, collapse = "\n - ")
)
}
# -------------------------
# Helper for legend labels
# -------------------------
make_legend_labels <- function(df_long) {
sizes <- df_long %>%
distinct(id, Group) %>%
count(Group, name = "N")
setNames(
paste0(sizes$Group, " (n = ", sizes$N, ")"),
sizes$Group
)
}
# =========================
# 1) RACE
# =========================
race_map <- c(
"R1" = "American Indian / Alaska Native",
"R2" = "Asian",
"R3" = "Black or African American",
"R4" = "Native Hawaiian / Pacific Islander",
"R5" = "White",
"R9" = "Other"
)
race_long <- df_status %>%
transmute(
id = row_number(),
enrolled_group,
Race_raw = str_squish(as.character(.data[[race_col]]))
) %>%
filter(!is.na(Race_raw), Race_raw != "") %>%
mutate(Race_raw = str_split(Race_raw, "\\r?\\n")) %>%
unnest(Race_raw) %>%
mutate(
Race_code = str_extract(Race_raw, "^R\\d+"),
Race_clean = unname(race_map[Race_code])
) %>%
filter(!is.na(Race_clean))
race_tab <- bind_rows(
race_long %>% mutate(Group = "All patients"),
race_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
) %>%
count(Group, Race_clean, name = "n") %>%
group_by(Group) %>%
mutate(percent = round(100 * n / sum(n), 1)) %>%
ungroup()
race_tab <- race_tab %>%
mutate(Group = factor(Group, levels = c("All patients", "Enrolled")))
race_legend_labels <- make_legend_labels(
bind_rows(
race_long %>% mutate(Group = "All patients"),
race_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
)
)
ggplot(race_tab, aes(x = Race_clean, y = percent, fill = Group)) +
geom_col(position = position_dodge(width = 0.8), width = 0.7) +
geom_text(aes(label = paste0(percent, "%")),
position = position_dodge(width = 0.8),
hjust = -0.1, size = 3.6) +
coord_flip(clip = "off") +
scale_fill_manual(values = c("All patients" = "#CFE1F2",
"Enrolled" = "#1F4E79"),
labels = race_legend_labels) +
expand_limits(y = max(race_tab$percent, na.rm = TRUE) + 8) +
theme_bw() +
labs(title = "Race Distribution: Enrolled vs All Patients",
x = NULL,
y = "% within group",
fill = "") +
theme(plot.title = element_text(face = "bold"),
legend.position = "top")

# =========================
# 2) LEGAL SEX
# =========================
sex_long <- df_status %>%
transmute(
id = row_number(),
enrolled_group,
Sex = str_squish(as.character(.data[[sex_col]]))
) %>%
filter(!is.na(Sex), Sex != "")
sex_tab <- bind_rows(
sex_long %>% mutate(Group = "All patients"),
sex_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
) %>%
count(Group, Sex, name = "n") %>%
group_by(Group) %>%
mutate(percent = round(100 * n / sum(n), 1)) %>%
ungroup() %>%
mutate(Group = factor(Group, levels = c("All patients", "Enrolled")))
sex_legend_labels <- make_legend_labels(
bind_rows(
sex_long %>% mutate(Group = "All patients"),
sex_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
)
)
ggplot(sex_tab, aes(x = Sex, y = percent, fill = Group)) +
geom_col(position = position_dodge(width = 0.8), width = 0.7) +
geom_text(aes(label = paste0(percent, "%")),
position = position_dodge(width = 0.8),
hjust = -0.1, size = 3.6) +
coord_flip(clip = "off") +
scale_fill_manual(values = c("All patients" = "#CFE1F2",
"Enrolled" = "#1F4E79"),
labels = sex_legend_labels) +
expand_limits(y = max(sex_tab$percent, na.rm = TRUE) + 8) +
theme_bw() +
labs(title = "Legal Sex: Enrolled vs All Patients",
x = NULL,
y = "% within group",
fill = "") +
theme(plot.title = element_text(face = "bold"),
legend.position = "top")

# =========================
# 3) ETHNICITY
# =========================
eth_long <- df_status %>%
transmute(
id = row_number(),
enrolled_group,
Eth_raw = str_squish(as.character(.data[[eth_col]])),
Eth_clean = case_when(
str_detect(Eth_raw, "E1") ~ "Spanish/Hispanic/Latino",
str_detect(Eth_raw, "E2") ~ "Not Spanish/Hispanic/Latino",
str_detect(Eth_raw, "S1") ~ "Patient declined",
str_detect(Eth_raw, "S2") ~ "Patient unavailable",
TRUE ~ NA_character_
)
) %>%
filter(!is.na(Eth_clean), Eth_clean != "")
eth_tab <- bind_rows(
eth_long %>% mutate(Group = "All patients"),
eth_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
) %>%
count(Group, Eth_clean, name = "n") %>%
group_by(Group) %>%
mutate(percent = round(100 * n / sum(n), 1)) %>%
ungroup() %>%
mutate(Group = factor(Group, levels = c("All patients", "Enrolled")))
eth_legend_labels <- make_legend_labels(
bind_rows(
eth_long %>% mutate(Group = "All patients"),
eth_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
)
)
ggplot(eth_tab, aes(x = Eth_clean, y = percent, fill = Group)) +
geom_col(position = position_dodge(width = 0.8), width = 0.7) +
geom_text(aes(label = paste0(percent, "%")),
position = position_dodge(width = 0.8),
hjust = -0.1, size = 3.6) +
coord_flip(clip = "off") +
scale_fill_manual(values = c("All patients" = "#CFE1F2",
"Enrolled" = "#1F4E79"),
labels = eth_legend_labels) +
expand_limits(y = max(eth_tab$percent, na.rm = TRUE) + 8) +
theme_bw() +
labs(title = "Ethnicity: Enrolled vs All Patients",
x = NULL,
y = "% within group",
fill = "") +
theme(plot.title = element_text(face = "bold"),
legend.position = "top")

# =========================
# T1 SELF-MANAGEMENT (PAST WEEK)
# =========================
col_t1_manage <- "t1_have_you_done_anything_in_the_past_week_to_help_manage_your_diabetes_like_walking_portion_control_or_drinking_more_water"
# ---- Safety check ----
if (!col_t1_manage %in% names(df_status)) {
stop("Column not found:\n ", col_t1_manage,
"\n\nCheck names(df_status) and update col_t1_manage.")
}
# ---- 1) Clean + categorize responses ----
# ---- 1) Clean + categorize responses (FIXED: blanks/not surveyed -> NA) ----
missing_tokens <- c(
"", "NA", "N/A", "NULL", "NONE",
"NOT SURVEYED", "NOT ASKED", "NOT APPLICABLE",
"SKIPPED", "MISSING", "UNKNOWN"
)
df_manage <- df_status %>%
transmute(
raw = .data[[col_t1_manage]],
raw_chr = as.character(raw),
raw_std = str_to_upper(str_squish(raw_chr)),
category = case_when(
# TRUE missing (actual NA)
is.na(raw_chr) ~ NA_character_,
# empty/whitespace after squish
raw_std == "" ~ NA_character_,
# common placeholders meaning "not surveyed" / missing
raw_std %in% missing_tokens ~ NA_character_,
# if your system uses "-" or "." as blank, include them:
raw_std %in% c("-", ".", "--") ~ NA_character_,
# actual responses
raw_std %in% c("N", "NO") ~ "No",
raw_std %in% c("Y", "YES") ~ "Yes (unspecified)",
str_detect(raw_std, "^DAILY") ~ "Daily",
str_detect(raw_std, "OCCASION") ~ "Occasionally",
TRUE ~ "Other / unclear"
)
)
# ---- 2) Summary table ----
manage_tab <- df_manage %>%
filter(!is.na(category)) %>%
count(category, name = "n") %>%
mutate(
percent = round(100 * n / sum(n), 1)
) %>%
arrange(desc(n))
manage_tab
## # A tibble: 4 × 3
## category n percent
## <chr> <int> <dbl>
## 1 Daily 22 55
## 2 Occasionally 12 30
## 3 Yes (unspecified) 5 12.5
## 4 No 1 2.5
# ---- 3) Bar chart ----
ggplot(manage_tab, aes(x = reorder(category, percent), y = percent)) +
geom_col(width = 0.7, fill = "#4E79A7") +
geom_text(
aes(label = paste0(percent, "% (n=", n, ")")),
hjust = -0.05,
size = 3.6
) +
coord_flip(clip = "off") +
expand_limits(y = max(manage_tab$percent, na.rm = TRUE) + 8) +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
theme_bw() +
labs(
title = "T1: Diabetes Self-Management Actions (Past Week)",
x = "",
y = "% of respondents"
) +
theme(
plot.title = element_text(face = "bold", size = 14),
axis.text = element_text(size = 11)
)

col_help_choices <- "t1_did_the_nutrition_material_you_got_from_white_plains_hospital_help_you_make_healthier_food_choices"
col_conf_food <- "t1_how_confident_are_you_in_making_food_choices_that_help_control_your_blood_sugar"
col_conf_labels <- "t1_how_confident_are_you_in_understanding_food_labels_or_nutrition_information"
col_conf_prepare <- "t1_how_confident_are_you_in_your_ability_to_prepare_healthy_meals_with_the_food_you_typically_have_at_home"
col_help_control <- "t1_how_much_did_this_program_help_you_feel_more_in_control_of_your_diabetes"
# CLEAN + STANDARDIZE LIKERT RESPONSES
# =========================
# =========================
# CLEANERS
# =========================
# A) Confidence-style questions (maps numeric + misspellings + MAYBE)
clean_confidence <- function(x) {
s <- str_to_upper(str_squish(as.character(x)))
case_when(
is.na(s) ~ NA_character_,
s %in% c("", "NA", "N/A", "NULL") ~ NA_character_,
# numeric scales (support both 1-4 and 1-5)
s %in% c("1") ~ "Not confident",
s %in% c("2") ~ "Somewhat not confident",
s %in% c("3") ~ "Confident",
s %in% c("4") ~ "Very confident",
s %in% c("5") ~ "Very confident", # if a stray 5 appears, bucket to Very confident
# explicit text + common typos
str_detect(s, "NOT CONFIDENT") ~ "Not confident",
str_detect(s, "SOMEWHAT NOT CONFIDENT") ~ "Somewhat not confident",
str_detect(s, "\\bMAYBE\\b") ~ "Maybe",
s %in% c("CONFIDENT", "COFIDENT", "CONFIDEMT", "CONFIDNT", "COFIDENT") ~ "Confident",
str_detect(s, "VERY CONFIDENT") ~ "Very confident",
str_detect(s, "SOMEWHAT CONFIDENT") ~ "Somewhat confident",
TRUE ~ "Other / unclear"
)
}
# B) Program-help / impact-style questions (A lot / Somewhat / A little, etc.)
clean_impact <- function(x) {
s <- str_to_upper(str_squish(as.character(x)))
case_when(
is.na(s) ~ NA_character_,
s %in% c("", "NA", "N/A", "NULL") ~ NA_character_,
# numeric (if any)
s %in% c("5") ~ "A lot",
s %in% c("4") ~ "Somewhat",
s %in% c("3") ~ "A little",
s %in% c("2") ~ "A little",
s %in% c("1") ~ "Not at all",
# text variants (case + spacing)
str_detect(s, "^A LOT") ~ "A lot",
str_detect(s, "^SOMEWHAT") ~ "Somewhat",
str_detect(s, "A LITTLE") ~ "A little",
str_detect(s, "NOT AT ALL") ~ "Not at all",
# yes/no (if it sneaks in)
s %in% c("YES", "Y") ~ "Yes",
s %in% c("NO", "N") ~ "No",
TRUE ~ "Other / unclear"
)
}
plot_t1_question <- function(df, col_name, title_text, cleaner_fun) {
if (!col_name %in% names(df)) stop("Column not found: ", col_name)
df_plot <- df %>%
transmute(response = cleaner_fun(.data[[col_name]])) %>%
filter(!is.na(response)) %>%
count(response, name = "n") %>%
mutate(percent = round(100 * n / sum(n), 1)) %>%
arrange(percent)
ggplot(df_plot, aes(x = reorder(response, percent), y = percent)) +
geom_col(fill = "#1F4E79", width = 0.7) +
geom_text(aes(label = paste0(percent, "% (n=", n, ")")),
hjust = -0.05, size = 3.6) +
coord_flip(clip = "off") +
expand_limits(y = max(df_plot$percent, na.rm = TRUE) + 8) +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
theme_bw() +
labs(title = title_text, x = "", y = "% of respondents") +
theme(plot.title = element_text(face = "bold", size = 14),
axis.text = element_text(size = 11))
}
p1 <- plot_t1_question(df_status, col_help_choices,
"T1: Nutrition Material Helped Make Healthier Food Choices",
cleaner_fun = clean_impact)
p2 <- plot_t1_question(df_status, col_conf_food,
"T1: Confidence in Making Food Choices to Control A1C",
cleaner_fun = clean_confidence)
p3 <- plot_t1_question(df_status, col_conf_labels,
"T1: Confidence Understanding Food Labels / Nutrition Info",
cleaner_fun = clean_confidence)
p4 <- plot_t1_question(df_status, col_conf_prepare,
"T1: Confidence Preparing Healthy Meals at Home",
cleaner_fun = clean_confidence)
print(p1); print(p2); print(p3); print(p4)




impact_tab <- df_status %>%
transmute(response = clean_impact(.data[[col_help_control]])) %>%
filter(!is.na(response)) %>%
count(response, name = "n") %>%
mutate(
percent = round(100 * n / sum(n), 1),
label = paste0(response, "\n", percent, "% (n=", n, ")")
)
ggplot(impact_tab, aes(x = "", y = n, fill = response)) +
geom_col(width = 1, color = "white") +
coord_polar(theta = "y") +
geom_text(
aes(label = paste0(percent, "%")),
position = position_stack(vjust = 0.5),
size = 4,
color = "white"
) +
scale_fill_manual(
values = c(
"A lot" = "#1F4E79",
"Somewhat" = "#2E86C1",
"A little" = "#5DADE2",
"Yes" = "#A9CCE3"
)
) +
theme_void() +
labs(
title = "T1: Program Helped Patient Feel More in Control of Diabetes",
fill = ""
) +
theme(
plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
legend.position = "right"
)

# =========================
# MEALS MADE USING PROGRAM FOOD — T1 + T2
# ---- EDIT if needed to match names(df_status) ----
col_meals_t1 <- "t1_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"
col_meals_t2 <- "t2_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"
# ---- safety check ----
need_cols <- c(col_meals_t1, col_meals_t2)
missing <- setdiff(need_cols, names(df_status))
if (length(missing) > 0) {
stop("Missing columns:\n - ", paste(missing, collapse = "\n - "),
"\n\nFix col_meals_t1 / col_meals_t2 using names(df_status).")
}
# ---- robust cleaner (works for BOTH T1 + T2) ----
clean_meals <- function(x) {
s <- str_to_lower(str_squish(as.character(x)))
# convert text numbers to digits
s <- str_replace_all(s, c(
"zero" = "0",
"one" = "1",
"two" = "2",
"three" = "3",
"four" = "4",
"five" = "5",
"six" = "6",
"seven" = "7",
"eight" = "8"
))
case_when(
# missing
is.na(s) ~ NA_character_,
s %in% c("", "na", "n/a", "null", "unsure of #", "unsure") ~ NA_character_,
# ran out = 0
str_detect(s, "ran out") ~ "0",
# explicit 0 / 1
str_detect(s, "^0$") ~ "0",
str_detect(s, "^1$") ~ "1",
# 2–3 bucket
str_detect(s, "2-3") | str_detect(s, "2–3") | str_detect(s, "2 to 3") |
str_detect(s, "^2$") | str_detect(s, "^3$") |
str_detect(s, "2 a day") | str_detect(s, "3 a day") ~ "2–3",
# 4–6 bucket
str_detect(s, "4-6") | str_detect(s, "4 to 6") |
str_detect(s, "4 to 5") |
str_detect(s, "3-4") | str_detect(s, "3 to 4") |
str_detect(s, "^4$") | str_detect(s, "^5$") | str_detect(s, "^6$") |
str_detect(s, "5-6") ~ "4–6",
# 7+
str_detect(s, "7") | str_detect(s, "8") ~ "7+",
TRUE ~ NA_character_
)
}
# ---- plotting helper ----
plot_meals <- function(df, col_name, title_text, y_max = 60) {
tab <- df %>%
transmute(meals = clean_meals(.data[[col_name]])) %>%
filter(!is.na(meals)) %>%
count(meals, name = "n") %>%
mutate(
meals = factor(meals, levels = c("0", "1", "2–3", "4–6", "7+")),
percent = round(100 * n / sum(n), 1),
label = paste0(percent, "% (n=", n, ")")
) %>%
arrange(meals)
ggplot(tab, aes(x = meals, y = percent)) +
geom_col(fill = "#1F4E79", width = 0.7) +
geom_text(aes(label = label), vjust = -0.35, size = 4) +
scale_y_continuous(
limits = c(0, y_max), # <-- FIXED Y LIMIT
labels = function(x) paste0(x, "%")
) +
theme_bw() +
labs(
title = title_text,
x = "Number of meals",
y = "% of respondents"
) +
theme(
plot.title = element_text(face = "bold", size = 14),
axis.text = element_text(size = 11)
)
}
p_meals_t1 <- plot_meals(df_status, col_meals_t1,
"Meals Prepared Using Program Food (Past Week) — T1",
y_max = 60)
p_meals_t2 <- plot_meals(df_status, col_meals_t2,
"Meals Prepared Using Program Food (Past Week) — T2",
y_max = 60)
print(p_meals_t1)

print(p_meals_t2)

# =========================
# EASE OF USING PROGRAM FOOD IN MEALS (T1)
# =========================
# ---- EDIT to match names(df_status) if needed ----
col_ease_t1 <- "t1_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"
col_ease_t2 <- "t2_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"
if (!col_ease_t1 %in% names(df_status)) {
stop("Column not found: ", col_ease_t1,
"\n\nFind the right one with: names(df_status)[str_detect(names(df_status), 'easy')]")
}
# ---- cleaner ----
clean_ease <- function(x) {
s <- str_to_upper(str_squish(as.character(x)))
case_when(
is.na(s) ~ NA_character_,
s %in% c("", "NA", "N/A", "NULL") ~ NA_character_,
# numeric scale variants (adjust if your coding differs)
s %in% c("1") ~ "Very easy",
s %in% c("2") ~ "Easy",
s %in% c("3") ~ "Neutral",
# text variants
str_detect(s, "VERY EASY") ~ "Very easy",
s == "EASY" ~ "Easy",
str_detect(s, "NEUTRAL") ~ "Neutral",
TRUE ~ NA_character_
)
}
ease_summary <- df_status %>%
transmute(response = clean_ease(.data[[col_ease_t1]])) %>%
filter(!is.na(response)) %>%
count(response, name = "n") %>%
mutate(
response = factor(response, levels = c("Neutral", "Easy", "Very easy"), ordered = TRUE),
percent = round(100 * n / sum(n), 1),
label = paste0(percent, "% (n=", n, ")")
) %>%
arrange(response)
ease_summary_t2 <- df_status %>%
transmute(response = clean_ease(.data[[col_ease_t2]])) %>%
filter(!is.na(response)) %>%
count(response, name = "n") %>%
mutate(
response = factor(response,
levels = c("Neutral", "Easy", "Very easy"),
ordered = TRUE),
percent = round(100 * n / sum(n), 1),
label = paste0(percent, "% (n=", n, ")")
) %>%
arrange(response)
ggplot(ease_summary, aes(x = response, y = percent)) +
geom_col(fill = "#2E86C1", width = 0.7) +
geom_text(aes(label = label), vjust = -0.35, size = 4) +
scale_y_continuous(labels = function(x) paste0(x, "%"),
expand = expansion(mult = c(0, 0.12))) +
theme_bw() +
labs(
title = "Ease of Using Program Food in Meals (T1)",
x = "",
y = "% of participants"
) +
theme(
axis.text.x = element_text(size = 11),
plot.title = element_text(size = 14, face = "bold")
)

ggplot(ease_summary_t2, aes(x = response, y = percent)) +
geom_col(fill = "#2E86C1", width = 0.7) +
geom_text(aes(label = label), vjust = -0.35, size = 4) +
scale_y_continuous(
limits = c(0, 60), # adjust if needed to match T1
labels = function(x) paste0(x, "%")
) +
theme_bw() +
labs(
title = "Ease of Using Program Food in Meals (T2)",
x = "",
y = "% of participants"
) +
theme(
axis.text.x = element_text(size = 11),
plot.title = element_text(size = 14, face = "bold")
)

# 1. Identify the T1 waste column explicitly
col_waste_t1 <- "t1_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"
col_waste_t2 <- "t2_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"
# ---- Safety check ----
need_cols <- c(col_waste_t1, col_waste_t2)
missing <- setdiff(need_cols, names(df_status))
if (length(missing) > 0) {
stop("Missing columns:\n - ",
paste(missing, collapse = "\n - "),
"\n\nRun names(df_status) and correct the column names above.")
}
# ---- Clean Yes / No properly (ignore blanks & non-surveyed) ----
clean_yesno <- function(x) {
s <- str_to_upper(str_squish(as.character(x)))
case_when(
is.na(s) ~ NA_character_,
s %in% c("", "NA", "N/A", "NULL") ~ NA_character_,
s %in% c("YES", "Y") ~ "Yes",
s %in% c("NO", "N") ~ "No",
TRUE ~ NA_character_
)
}
# ---- Function to build summary + plot ----
plot_waste <- function(df, col_name, title_text) {
tab <- df %>%
transmute(Response = clean_yesno(.data[[col_name]])) %>%
filter(!is.na(Response)) %>%
count(Response, name = "n") %>%
mutate(
Response = factor(Response, levels = c("No", "Yes")),
percent = round(100 * n / sum(n), 1),
label = paste0(percent, "%\n(n=", n, ")")
)
ggplot(tab, aes(x = Response, y = percent, fill = Response)) +
geom_col(width = 0.6) +
geom_text(aes(label = label), vjust = -0.3, size = 4) +
scale_fill_manual(values = c("No" = "#1a9850", "Yes" = "#d73027")) +
scale_y_continuous(limits = c(0, 100),
labels = function(x) paste0(x, "%")) +
theme_bw() +
labs(
title = title_text,
x = "",
y = "% of respondents"
) +
theme(
legend.position = "none",
plot.title = element_text(size = 14, face = "bold")
)
}
# ---- Draw both graphs ----
p_waste_t1 <- plot_waste(df_status, col_waste_t1,
"Did Patients Throw Away Delivered Food? (T1)")
p_waste_t2 <- plot_waste(df_status, col_waste_t2,
"Did Patients Throw Away Delivered Food? (T2)")
print(p_waste_t1)

print(p_waste_t2)

# =========================
# SLIDE: SAMPLE SNAPSHOT
# =========================
sample_snapshot <- df_status %>%
count(enrolled_group, cohort_final, name = "N") %>%
arrange(enrolled_group, cohort_final)
sample_snapshot
## # A tibble: 4 × 3
## enrolled_group cohort_final N
## <chr> <chr> <int>
## 1 Enrolled Cohort 1 60
## 2 Enrolled Cohort 2 41
## 3 Enrolled Cohort 3 35
## 4 Not enrolled Not enrolled 817
# =========================
# SLIDE: DENOMINATORS (T1/T2 response counts)
# Uses your existing is_nonblank()
# =========================
count_answered <- function(df, col) {
tibble(
column = col,
answered_n = sum(is_nonblank(df[[col]])),
total_n = nrow(df),
answered_pct = round(100 * answered_n / total_n, 1)
)
}
# list the columns you’re actually plotting in the deck
deck_cols <- c(
# A1c change fields you already defined earlier
a1c_t1_col, a1c_t2_col,
# utilization
col_ed_base, col_ed_t1, col_ed_t2,
col_ip_base, col_ip_t1, col_ip_t2,
# engagement + experience
col_meals_t1, col_meals_t2,
col_ease_t1, col_ease_t2,
col_t1_manage,
# confidence/help
col_help_choices, col_conf_food, col_conf_labels, col_conf_prepare, col_help_control
)
missing_deck_cols <- setdiff(deck_cols, names(df_status))
if (length(missing_deck_cols) > 0) {
stop("Deck columns missing:\n - ", paste(missing_deck_cols, collapse = "\n - "))
}
denoms_all <- bind_rows(lapply(deck_cols, function(cc) count_answered(df_status, cc))) %>%
arrange(desc(answered_n))
denoms_all
## # A tibble: 18 × 4
## column answered_n total_n answered_pct
## <chr> <int> <int> <dbl>
## 1 ed_vis_last_90_days 953 953 100
## 2 hosp_last_90_days 953 953 100
## 3 ed_utilization_t1 107 953 11.2
## 4 ip_utilization_t1 107 953 11.2
## 5 t1_in_the_past_week_how_many_meals_did_you_m… 66 953 6.9
## 6 t1_in_the_past_week_how_easy_was_it_to_use_t… 64 953 6.7
## 7 t1_how_confident_are_you_in_making_food_choi… 64 953 6.7
## 8 t1_how_confident_are_you_in_understanding_fo… 64 953 6.7
## 9 t1_how_much_did_this_program_help_you_feel_m… 64 953 6.7
## 10 t1_did_the_nutrition_material_you_got_from_w… 61 953 6.4
## 11 t1_how_confident_are_you_in_your_ability_to_… 61 953 6.4
## 12 hospital_utilization_t2 60 953 6.3
## 13 ed_utilization_t2 59 953 6.2
## 14 a1c_change_t1 55 953 5.8
## 15 t2_in_the_past_week_how_many_meals_did_you_m… 43 953 4.5
## 16 t2_in_the_past_week_how_easy_was_it_to_use_t… 42 953 4.4
## 17 t1_have_you_done_anything_in_the_past_week_t… 40 953 4.2
## 18 a1c_change_t2 31 953 3.3
denoms_by_cohort <- function(df, cols) {
bind_rows(lapply(cols, function(cc) {
df %>%
group_by(cohort_final) %>%
summarise(
column = cc,
answered_n = sum(is_nonblank(.data[[cc]])),
total_n = n(),
answered_pct = round(100 * answered_n / total_n, 1),
.groups = "drop"
)
}))
}
denoms_by_cohort(df_status, deck_cols)
## # A tibble: 17,154 × 5
## cohort_final column answered_n total_n answered_pct
## <chr> <chr> <int> <int> <dbl>
## 1 Cohort 1 Abdominal Pain; Vomiting; Diarr… 29 60 48.3
## 2 Cohort 1 Flank Pain; Difficulty Urinating 29 60 48.3
## 3 Cohort 1 <NA> 29 60 48.3
## 4 Cohort 1 Rash 29 60 48.3
## 5 Cohort 1 Blood in Urine 29 60 48.3
## 6 Cohort 1 Fever; Post-op Problem 29 60 48.3
## 7 Cohort 1 Irregular Heart Beat 29 60 48.3
## 8 Cohort 1 Foot Swelling; Wound Check 29 60 48.3
## 9 Cohort 1 Chest Pain 29 60 48.3
## 10 Cohort 1 Hyperglycemia; Nausea; Diarrhea 29 60 48.3
## # ℹ 17,144 more rows
# =========================
# SLIDE: A1c CHANGE SUMMARY TABLES
# Uses your existing parse_delta()
# =========================
a1c_change_summary <- function(df_in, delta_col, label) {
d <- df_in %>%
transmute(delta = parse_delta(.data[[delta_col]])) %>%
filter(!is.na(delta)) %>%
mutate(
category = case_when(
delta <= -1.0 ~ "Improved (≥1.0% ↓)",
delta >= 1.0 ~ "Worsened (≥1.0% ↑)",
TRUE ~ "No meaningful change (<1.0%)"
),
category = factor(category, levels = c(
"Improved (≥1.0% ↓)",
"No meaningful change (<1.0%)",
"Worsened (≥1.0% ↑)"
))
)
tab <- d %>%
count(category, name = "n") %>%
mutate(
pct = round(100 * n / sum(n), 1),
N_total = sum(n),
panel = label
)
tab
}
a1c_t1_sum <- a1c_change_summary(
df_status %>% filter(cohort_final %in% c("Cohort 1", "Cohort 2")),
a1c_t1_col,
"T1: Cohort 1 + 2"
)
a1c_t2_sum <- a1c_change_summary(
df_status %>% filter(cohort_final == "Cohort 1"),
a1c_t2_col,
"T2: Cohort 1"
)
bind_rows(a1c_t1_sum, a1c_t2_sum)
## # A tibble: 6 × 5
## category n pct N_total panel
## <fct> <int> <dbl> <int> <chr>
## 1 Improved (≥1.0% ↓) 32 62.7 51 T1: Cohort 1 + 2
## 2 No meaningful change (<1.0%) 17 33.3 51 T1: Cohort 1 + 2
## 3 Worsened (≥1.0% ↑) 2 3.9 51 T1: Cohort 1 + 2
## 4 Improved (≥1.0% ↓) 19 63.3 30 T2: Cohort 1
## 5 No meaningful change (<1.0%) 9 30 30 T2: Cohort 1
## 6 Worsened (≥1.0% ↑) 2 6.7 30 T2: Cohort 1
# =========================
# SLIDE: UTILIZATION CHANGE TABLE (Baseline -> T1), Enrolled
# Uses df_util_pairs you already create
# =========================
util_change_summary <- function(df, base_var, t1_var, label) {
df %>%
transmute(
domain = label,
change = case_when(
.data[[t1_var]] < .data[[base_var]] ~ "Less",
.data[[t1_var]] > .data[[base_var]] ~ "More",
TRUE ~ "No change"
)
) %>%
count(domain, change, name = "n") %>%
group_by(domain) %>%
mutate(
pct = round(100 * n / sum(n), 1),
N = sum(n)
) %>%
ungroup() %>%
mutate(change = factor(change, levels = c("Less", "No change", "More")))
}
ed_change_sum <- util_change_summary(df_util_pairs, "ED_base", "ED_T1", "ED visits")
ip_change_sum <- util_change_summary(df_util_pairs, "IP_base", "IP_T1", "IP stays")
bind_rows(ed_change_sum, ip_change_sum)
## # A tibble: 6 × 5
## domain change n pct N
## <chr> <fct> <int> <dbl> <int>
## 1 ED visits Less 58 57.4 101
## 2 ED visits More 9 8.9 101
## 3 ED visits No change 34 33.7 101
## 4 IP stays Less 62 61.4 101
## 5 IP stays More 6 5.9 101
## 6 IP stays No change 33 32.7 101
# =========================
# COHORT 1 — HbA1c SLIDE-READY STATS TABLE
# Uses existing: df_c1, clean_a1c(), col_a1c_all, col_a1c_t1, col_a1c_t2
# =========================
stopifnot(exists("df_c1"))
need_cols <- c(col_a1c_all, col_a1c_t1, col_a1c_t2)
missing <- setdiff(need_cols, names(df_c1))
if (length(missing) > 0) {
stop("Missing HbA1c columns in df_c1:\n - ", paste(missing, collapse = "\n - "))
}
# ---- Build Cohort 1 HbA1c numeric fields ----
a1c_c1 <- df_c1 %>%
transmute(
id = row_number(),
a1c_base = clean_a1c(.data[[col_a1c_all]]),
a1c_t1 = clean_a1c(.data[[col_a1c_t1]]),
a1c_t2 = clean_a1c(.data[[col_a1c_t2]])
)
# ---- helpers for clean formatting ----
fmt_mean_sd <- function(x, digits = 2) {
x <- x[!is.na(x)]
if (length(x) == 0) return("-")
paste0(
format(round(mean(x), digits), nsmall = digits),
" ± ",
format(round(sd(x), digits), nsmall = digits)
)
}
fmt_num <- function(x, digits = 2) {
if (is.na(x)) return(NA_character_)
format(round(x, digits), nsmall = digits)
}
fmt_p <- function(p) {
if (is.na(p)) return(NA_character_)
if (p < 0.001) return("<0.001")
format(round(p, 3), nsmall = 3)
}
# ---- one comparison -> one clean row ----
a1c_compare_row <- function(df, x_var, y_var, label, digits = 2) {
d <- df %>%
transmute(x = .data[[x_var]], y = .data[[y_var]]) %>%
filter(!is.na(x) & !is.na(y)) %>%
mutate(diff = y - x)
n <- nrow(d)
if (n < 3) {
return(tibble(
Comparison = label,
`n (paired)` = n,
`Baseline mean ± SD` = "-",
`Follow-up mean ± SD` = "-",
`Mean Δ (pp) [95% CI]` = "Not enough paired data",
`Median Δ (pp)` = "-",
`p (paired t)` = "-"
))
}
tt <- t.test(d$y, d$x, paired = TRUE)
tibble(
Comparison = label,
`n (paired)` = n,
`Baseline mean ± SD` = fmt_mean_sd(d$x, digits),
`Follow-up mean ± SD` = fmt_mean_sd(d$y, digits),
`Mean Δ (pp) [95% CI]` = paste0(
fmt_num(mean(d$diff), digits),
" [",
fmt_num(tt$conf.int[1], digits),
", ",
fmt_num(tt$conf.int[2], digits),
"]"
),
`Median Δ (pp)` = fmt_num(median(d$diff), digits),
`p (paired t)` = fmt_p(tt$p.value)
)
}
# ---- MAIN SLIDE TABLE (clean) ----
a1c_slide_table_c1 <- bind_rows(
a1c_compare_row(a1c_c1, "a1c_base", "a1c_t1", "Cohort 1: Baseline → T1"),
a1c_compare_row(a1c_c1, "a1c_base", "a1c_t2", "Cohort 1: Baseline → T2"),
a1c_compare_row(a1c_c1, "a1c_t1", "a1c_t2", "Cohort 1: T1 → T2")
)
a1c_slide_table_c1
## # A tibble: 3 × 7
## Comparison `n (paired)` `Baseline mean ± SD` `Follow-up mean ± SD`
## <chr> <int> <chr> <chr>
## 1 Cohort 1: Baseline → … 29 10.24 ± 2.40 7.91 ± 1.81
## 2 Cohort 1: Baseline → … 30 10.62 ± 2.03 8.57 ± 2.27
## 3 Cohort 1: T1 → T2 18 7.82 ± 1.72 7.68 ± 1.94
## # ℹ 3 more variables: `Mean Δ (pp) [95% CI]` <chr>, `Median Δ (pp)` <chr>,
## # `p (paired t)` <chr>
# Optional: nice rendering in Rmd
# knitr::kable(a1c_slide_table_c1, caption = "Cohort 1 HbA1c change (paired)")
# ---- OPTIONAL APPENDIX TABLE (diagnostics + nonparametric) ----
a1c_diag_row <- function(df, x_var, y_var, label) {
d <- df %>%
transmute(x = .data[[x_var]], y = .data[[y_var]]) %>%
filter(!is.na(x) & !is.na(y)) %>%
mutate(diff = y - x)
n <- nrow(d)
if (n < 3) {
return(tibble(
Comparison = label,
`n (paired)` = n,
`Shapiro p (diff)` = "-",
`p (Wilcoxon signed-rank)` = "-"
))
}
sh <- shapiro.test(d$diff)
wx <- wilcox.test(d$y, d$x, paired = TRUE, exact = FALSE)
tibble(
Comparison = label,
`n (paired)` = n,
`Shapiro p (diff)` = fmt_p(sh$p.value),
`p (Wilcoxon signed-rank)` = fmt_p(wx$p.value)
)
}
a1c_appendix_diag_c1 <- bind_rows(
a1c_diag_row(a1c_c1, "a1c_base", "a1c_t1", "Cohort 1: Baseline → T1"),
a1c_diag_row(a1c_c1, "a1c_base", "a1c_t2", "Cohort 1: Baseline → T2"),
a1c_diag_row(a1c_c1, "a1c_t1", "a1c_t2", "Cohort 1: T1 → T2")
)
a1c_appendix_diag_c1
## # A tibble: 3 × 4
## Comparison `n (paired)` `Shapiro p (diff)` p (Wilcoxon signed-r…¹
## <chr> <int> <chr> <chr>
## 1 Cohort 1: Baseline → T1 29 0.663 <0.001
## 2 Cohort 1: Baseline → T2 30 0.964 <0.001
## 3 Cohort 1: T1 → T2 18 0.017 0.541
## # ℹ abbreviated name: ¹`p (Wilcoxon signed-rank)`
# =========================
# COHORT 1 HbA1c — PAIRED CHANGE PLOT (spaghetti + mean)
# =========================
stopifnot(exists("df_c1"))
a1c_pairs_long <- df_c1 %>%
transmute(
id = row_number(),
Baseline = clean_a1c(.data[[col_a1c_all]]),
T1 = clean_a1c(.data[[col_a1c_t1]]),
T2 = clean_a1c(.data[[col_a1c_t2]])
) %>%
pivot_longer(cols = c(Baseline, T1, T2),
names_to = "Timepoint",
values_to = "HbA1c") %>%
filter(!is.na(HbA1c)) %>%
mutate(
Timepoint = factor(Timepoint, levels = c("Baseline", "T1", "T2"))
)
# Only keep patients who have baseline AND at least one follow-up
a1c_keep_ids <- a1c_pairs_long %>%
group_by(id) %>%
summarize(has_base = any(Timepoint == "Baseline"),
has_fu = any(Timepoint %in% c("T1","T2")),
.groups = "drop") %>%
filter(has_base & has_fu) %>%
pull(id)
a1c_pairs_long <- a1c_pairs_long %>% filter(id %in% a1c_keep_ids)
# mean points per timepoint (computed from the plotted data)
a1c_means <- a1c_pairs_long %>%
group_by(Timepoint) %>%
summarize(
mean_a1c = mean(HbA1c, na.rm = TRUE),
n = n_distinct(id),
.groups = "drop"
)
ggplot(a1c_pairs_long, aes(x = Timepoint, y = HbA1c, group = id)) +
geom_line(alpha = 0.25, linewidth = 0.6) +
geom_point(alpha = 0.55, size = 1.6) +
geom_point(data = a1c_means, aes(x = Timepoint, y = mean_a1c),
inherit.aes = FALSE, size = 3.2) +
geom_line(data = a1c_means, aes(x = Timepoint, y = mean_a1c, group = 1),
inherit.aes = FALSE, linewidth = 1.1) +
labs(
title = "Cohort 1 HbA1c Over Time (Paired Patients)",
subtitle = "Thin lines = individual patients; thick line/dots = mean among plotted patients",
x = NULL,
y = "HbA1c (%)"
) +
theme_bw() +
theme(
plot.title = element_text(face = "bold", size = 14),
axis.text = element_text(size = 11)
)

# =========================
# COHORT 1 HbA1c — CHANGE DISTRIBUTION (Δ) with mean + 95% CI
# =========================
a1c_diff_long <- df_c1 %>%
transmute(
id = row_number(),
a1c_base = clean_a1c(.data[[col_a1c_all]]),
a1c_t1 = clean_a1c(.data[[col_a1c_t1]]),
a1c_t2 = clean_a1c(.data[[col_a1c_t2]])
)
# Build paired diffs (follow-up minus baseline)
diff_t1 <- a1c_diff_long %>%
filter(!is.na(a1c_base) & !is.na(a1c_t1)) %>%
transmute(Comparison = "Baseline → T1", diff = a1c_t1 - a1c_base)
diff_t2 <- a1c_diff_long %>%
filter(!is.na(a1c_base) & !is.na(a1c_t2)) %>%
transmute(Comparison = "Baseline → T2", diff = a1c_t2 - a1c_base)
diff_plot <- bind_rows(diff_t1, diff_t2) %>%
mutate(Comparison = factor(Comparison, levels = c("Baseline → T1", "Baseline → T2")))
# Mean + 95% CI per comparison (paired diff CI)
diff_ci <- diff_plot %>%
group_by(Comparison) %>%
summarize(
n = n(),
mean_diff = mean(diff, na.rm = TRUE),
tt = list(t.test(diff, mu = 0)),
lo = tt[[1]]$conf.int[1],
hi = tt[[1]]$conf.int[2],
.groups = "drop"
)
ggplot(diff_plot, aes(x = Comparison, y = diff)) +
geom_hline(yintercept = 0, linetype = "dashed", linewidth = 0.7) +
geom_boxplot(width = 0.35, outlier.shape = NA, alpha = 0.25) +
geom_jitter(width = 0.12, height = 0, alpha = 0.55, size = 1.6) +
geom_point(data = diff_ci, aes(x = Comparison, y = mean_diff),
inherit.aes = FALSE, size = 3.0) +
geom_errorbar(data = diff_ci, aes(x = Comparison, ymin = lo, ymax = hi),
inherit.aes = FALSE, width = 0.10, linewidth = 0.9) +
labs(
title = "Cohort 1 HbA1c Change (Δ) with 95% CI",
subtitle = "Δ = follow-up HbA1c − baseline HbA1c (negative = improvement)",
x = NULL,
y = "HbA1c change (percentage points)"
) +
theme_bw() +
theme(
plot.title = element_text(face = "bold", size = 14),
axis.text = element_text(size = 11)
)

# =========================
# FIND DISCONTINUATION FIELDS (by column name)
# =========================
kw <- c(
"discont", "discharg", "drop", "withdraw", "declin", "expire", "deceas",
"status", "active", "inactive", "program", "enroll", "end", "stop", "reason", "closed"
)
cand_cols <- names(df_status)[
Reduce(`|`, lapply(kw, function(k) str_detect(names(df_status), fixed(k))))
]
cand_cols
## [1] "pre_enrollment_notes"
## [2] "enrolled"
## [3] "discharge_disposition"
## [4] "pt_portal_status"
## [5] "ps_in_the_past_12_months_did_the_food_you_could_afford_not_last_until_the_end_of_the_month_and_you_couldn_t_get_more"
## [6] "pre_screen_interested_in_program"
## [7] "screen_for_enrollment_interested_in_program"
## [8] "t0_since_your_discharge_from_the_hospital_have_you_had_an_appointment_with_a_healthcare_provider_for_your_diabetes"
## [9] "t1_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"
## [10] "t1_how_much_did_this_program_help_you_feel_more_in_control_of_your_diabetes"
## [11] "in_one_to_two_sentences_can_you_tell_me_what_feedback_you_have_that_can_improve_this_program_for_others_58"
## [12] "t1_do_you_have_any_suggestions_or_feedback_to_improve_this_program_for_others"
## [13] "t2_in_the_past_6_months_did_the_food_you_could_afford_not_last_until_the_end_of_the_month_and_you_couldn_t_get_more"
## [14] "t2_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"
## [15] "t2_how_much_did_this_program_help_you_feel_more_in_control_of_your_diabetes"
## [16] "in_one_to_two_sentences_can_you_tell_me_what_feedback_you_have_that_can_improve_this_program_for_others_81"
## [17] "t2_do_you_have_any_suggestions_or_feedback_to_improve_this_program_for_others"
## [18] "enrolled_clean"
## [19] "enrolled_group"
# ---- Total counts table ----
total_counts <- df_status %>%
summarise(
Enrolled = sum(enrolled_group == "Enrolled", na.rm = TRUE),
Cohort_1 = sum(cohort_final == "Cohort 1", na.rm = TRUE),
Cohort_2 = sum(cohort_final == "Cohort 2", na.rm = TRUE),
Cohort_3 = sum(cohort_final == "Cohort 3", na.rm = TRUE)
)
total_counts
## # A tibble: 1 × 4
## Enrolled Cohort_1 Cohort_2 Cohort_3
## <int> <int> <int> <int>
## 1 136 60 41 35
parse_delta <- function(x) {
suppressWarnings(as.numeric(stringr::str_replace_all(
stringr::str_squish(as.character(x)),
"[^0-9.\\-]", ""
)))
}
a1c_cards <- function(data, delta_col, cohort_filter, label) {
d <- data %>%
filter(!!rlang::parse_expr(cohort_filter)) %>%
transmute(delta = parse_delta(.data[[delta_col]])) %>%
filter(!is.na(delta)) %>%
mutate(
bucket = dplyr::case_when(
delta <= -1.0 ~ "Improved",
delta >= 1.0 ~ "Worsened",
TRUE ~ "No change"
)
)
out <- d %>%
count(bucket, name = "n") %>%
mutate(
N = sum(n),
pct = round(100 * n / N, 1)
) %>%
arrange(factor(bucket, levels = c("Improved","No change","Worsened"))) %>%
mutate(Slide = label) %>%
select(Slide, bucket, n, N, pct)
out
}
# ---- VERIFY: A1C Change at T1: Cohort 1 + Cohort 2 (n should be 51 on slide) ----
a1c_t1_verify <- a1c_cards(
df_status,
delta_col = a1c_t1_col,
cohort_filter = "cohort_final %in% c('Cohort 1','Cohort 2')",
label = "A1C Change (T1): Cohort 1 + Cohort 2"
)
a1c_t1_verify
## # A tibble: 3 × 5
## Slide bucket n N pct
## <chr> <chr> <int> <int> <dbl>
## 1 A1C Change (T1): Cohort 1 + Cohort 2 Improved 32 51 62.7
## 2 A1C Change (T1): Cohort 1 + Cohort 2 No change 17 51 33.3
## 3 A1C Change (T1): Cohort 1 + Cohort 2 Worsened 2 51 3.9
# ---- VERIFY: A1C Change at T2: Cohort 1 only (n should be 30 on slide) ----
a1c_t2_verify <- a1c_cards(
df_status,
delta_col = a1c_t2_col,
cohort_filter = "cohort_final == 'Cohort 1'",
label = "A1C Change (T2): Cohort 1"
)
a1c_t2_verify
## # A tibble: 3 × 5
## Slide bucket n N pct
## <chr> <chr> <int> <int> <dbl>
## 1 A1C Change (T2): Cohort 1 Improved 19 30 63.3
## 2 A1C Change (T2): Cohort 1 No change 9 30 30
## 3 A1C Change (T2): Cohort 1 Worsened 2 30 6.7
# =========================
# A1C (PDSA language, cohort-specific ENROLLED stats)
# Uses your existing make_a1c_plots() + a1c_change_* columns
# Does NOT modify df_status
# =========================
stopifnot(exists("df_status"))
stopifnot(exists("make_a1c_plots"))
stopifnot("enrolled_group" %in% names(df_status))
stopifnot("cohort_final" %in% names(df_status))
# ---- ENROLLED-only analysis frame (new object; df_status untouched) ----
df_enrolled <- df_status %>% dplyr::filter(enrolled_group == "Enrolled")
# ---- Map your existing delta columns to how YOU will speak (PDSA) ----
# IMPORTANT: we are NOT re-computing deltas here; we are just relabeling.
# Adjust labels if needed, but keep the columns.
cycle_map <- list(
# You said: Cohort 2 uses "t0->t1" in the Nov→Feb window.
# In your current dataset, that delta is stored in a1c_change_t1.
PDSA2_C2 = list(
cohort = "Cohort 2",
delta_col = a1c_t1_col,
title = "A1C Change — PDSA Cycle 2 (Nov→Feb): Cohort 2 (ENROLLED)"
),
# Cohort 1: you want to discuss baseline→3mo (Aug→Oct) AND later (Nov→Feb).
# With your current objects, you only have two delta fields: a1c_change_t1 and a1c_change_t2.
# Use whichever your chart reviewer intended:
# - a1c_change_t1 = earlier delta
# - a1c_change_t2 = later / most recent delta
PDSA1_C1 = list(
cohort = "Cohort 1",
delta_col = a1c_t1_col,
title = "A1C Change — PDSA Cycle 1 (Aug→Oct): Cohort 1 (ENROLLED)"
),
PDSA2_C1 = list(
cohort = "Cohort 1",
delta_col = a1c_t2_col,
title = "A1C Change — PDSA Cycle 2 (Nov→Feb): Cohort 1 (ENROLLED)"
)
)
# ---- Safety checks for the mapped columns ----
need_delta_cols <- unique(vapply(cycle_map, function(x) x$delta_col, character(1)))
missing_delta_cols <- setdiff(need_delta_cols, names(df_status))
if (length(missing_delta_cols) > 0) {
stop(
"Missing A1C delta columns in df_status:\n - ",
paste(missing_delta_cols, collapse = "\n - "),
"\n\nFix your a1c_t*_col variables or cycle_map."
)
}
# ---- Run each cohort/cycle using your existing plot function ----
run_cycle <- function(df_in, cohort_name, delta_col, title_text) {
df_cycle <- df_in %>% dplyr::filter(cohort_final == cohort_name)
out <- make_a1c_plots(df_cycle, delta_col, title_text)
list(tab = out$tab, bar = out$bar, dist = out$dist)
}
# Cohort 1 PDSA1 (uses a1c_change_t1 as stored)
res_pdsa1_c1 <- run_cycle(df_enrolled, cycle_map$PDSA1_C1$cohort,
cycle_map$PDSA1_C1$delta_col, cycle_map$PDSA1_C1$title)
# Cohort 1 PDSA2 (uses a1c_change_t2 as stored)
res_pdsa2_c1 <- run_cycle(df_enrolled, cycle_map$PDSA2_C1$cohort,
cycle_map$PDSA2_C1$delta_col, cycle_map$PDSA2_C1$title)
# Cohort 2 PDSA2 (uses a1c_change_t1 as stored)
res_pdsa2_c2 <- run_cycle(df_enrolled, cycle_map$PDSA2_C2$cohort,
cycle_map$PDSA2_C2$delta_col, cycle_map$PDSA2_C2$title)
# ---- Outputs (tables + slide-ready bar charts) ----
res_pdsa1_c1$tab
## # A tibble: 3 × 4
## Category n pct label
## <fct> <int> <dbl> <chr>
## 1 Improved (≥1.0%) 19 67.9 67.9% (n=19)
## 2 No meaningful change 8 28.6 28.6% (n=8)
## 3 Worsened (≥1.0%) 1 3.6 3.6% (n=1)
res_pdsa1_c1$bar

res_pdsa2_c1$tab
## # A tibble: 3 × 4
## Category n pct label
## <fct> <int> <dbl> <chr>
## 1 Improved (≥1.0%) 19 63.3 63.3% (n=19)
## 2 No meaningful change 9 30 30% (n=9)
## 3 Worsened (≥1.0%) 2 6.7 6.7% (n=2)
res_pdsa2_c1$bar

res_pdsa2_c2$tab
## # A tibble: 3 × 4
## Category n pct label
## <fct> <int> <dbl> <chr>
## 1 Improved (≥1.0%) 13 56.5 56.5% (n=13)
## 2 No meaningful change 9 39.1 39.1% (n=9)
## 3 Worsened (≥1.0%) 1 4.3 4.3% (n=1)
res_pdsa2_c2$bar

# ---- helpers (do NOT modify df_status) ----
is_nonblank <- function(x) {
x_chr <- as.character(x)
x_std <- stringr::str_to_upper(stringr::str_squish(x_chr))
!(is.na(x_chr) | x_std == "" | x_std %in% c("NA", "N/A", "NULL"))
}
pct <- function(n, d) ifelse(d == 0, NA_real_, round(100 * n / d, 1))
parse_num <- function(x) {
suppressWarnings(as.numeric(stringr::str_replace_all(
stringr::str_squish(as.character(x)),
"[^0-9.\\-]", ""
)))
}
parse_count <- function(x) {
s <- stringr::str_squish(as.character(x))
s[stringr::str_to_upper(s) %in% c("", "NA","N/A","NULL")] <- NA_character_
suppressWarnings(as.numeric(stringr::str_replace_all(s, "[^0-9\\-\\.]", "")))
}
clean_yesno <- function(x) {
s <- stringr::str_to_upper(stringr::str_squish(as.character(x)))
dplyr::case_when(
is.na(s) ~ NA_character_,
s %in% c("", "NA","N/A","NULL") ~ NA_character_,
s %in% c("Y","YES") ~ "Yes",
s %in% c("N","NO") ~ "No",
TRUE ~ NA_character_
)
}
race_col <- "race"
eth_col <- "ethnicity"
sex_col <- "legal_sex"
demo_ns <- df_status %>%
dplyr::summarise(
all_race_n = sum(is_nonblank(.data[[race_col]])),
enr_race_n = sum(enrolled_group == "Enrolled" & is_nonblank(.data[[race_col]])),
all_eth_n = sum(is_nonblank(.data[[eth_col]])),
enr_eth_n = sum(enrolled_group == "Enrolled" & is_nonblank(.data[[eth_col]])),
all_sex_n = sum(is_nonblank(.data[[sex_col]])),
enr_sex_n = sum(enrolled_group == "Enrolled" & is_nonblank(.data[[sex_col]]))
)
demo_ns
## # A tibble: 1 × 6
## all_race_n enr_race_n all_eth_n enr_eth_n all_sex_n enr_sex_n
## <int> <int> <int> <int> <int> <int>
## 1 953 136 953 136 953 136
race_map <- c(
"R1" = "American Indian / Alaska Native",
"R2" = "Asian",
"R3" = "Black or African American",
"R4" = "Native Hawaiian / Pacific Islander",
"R5" = "White",
"R9" = "Other"
)
race_long <- df_status %>%
dplyr::transmute(
enrolled_group,
Race_raw = stringr::str_squish(as.character(.data[[race_col]]))
) %>%
dplyr::filter(is_nonblank(Race_raw)) %>%
dplyr::mutate(Race_raw = stringr::str_split(Race_raw, "\\r?\\n")) %>%
tidyr::unnest(Race_raw) %>%
dplyr::mutate(
Race_code = stringr::str_extract(Race_raw, "^R\\d+"),
Race_clean = unname(race_map[Race_code])
) %>%
dplyr::filter(!is.na(Race_clean))
race_tab <- dplyr::bind_rows(
race_long %>% dplyr::mutate(Group = "All Patients"),
race_long %>% dplyr::filter(enrolled_group == "Enrolled") %>% dplyr::mutate(Group = "Enrolled")
) %>%
dplyr::count(Group, Race_clean, name = "n") %>%
dplyr::group_by(Group) %>%
dplyr::mutate(
N = sum(n),
percent = pct(n, N)
) %>%
dplyr::ungroup()
# If the slide only shows these 4:
race_slide <- race_tab %>%
dplyr::filter(Race_clean %in% c("Black or African American","Other","White","Asian")) %>%
dplyr::arrange(dplyr::desc(Group), dplyr::desc(percent))
race_slide
## # A tibble: 8 × 5
## Group Race_clean n N percent
## <chr> <chr> <int> <int> <dbl>
## 1 Enrolled Black or African American 57 135 42.2
## 2 Enrolled Other 48 135 35.6
## 3 Enrolled White 27 135 20
## 4 Enrolled Asian 2 135 1.5
## 5 All Patients White 379 924 41
## 6 All Patients Other 261 924 28.2
## 7 All Patients Black or African American 248 924 26.8
## 8 All Patients Asian 30 924 3.2
eth_tab_full <- df_status %>%
dplyr::transmute(
Group = dplyr::if_else(enrolled_group == "Enrolled", "Enrolled", "All Patients"),
Eth_raw = stringr::str_to_upper(stringr::str_squish(as.character(.data[[eth_col]]))),
Eth_clean = dplyr::case_when(
stringr::str_detect(Eth_raw, "E1") ~ "Spanish/Hispanic/Latino",
stringr::str_detect(Eth_raw, "E2") ~ "Not Hispanic/Latino",
TRUE ~ "Missing/Declined/Unknown"
)
) %>%
dplyr::count(Group, Eth_clean, name = "n") %>%
dplyr::group_by(Group) %>%
dplyr::mutate(
N = sum(n),
percent = round(100 * n / N, 1)
) %>%
dplyr::ungroup()
eth_tab_full
## # A tibble: 6 × 5
## Group Eth_clean n N percent
## <chr> <chr> <int> <int> <dbl>
## 1 All Patients Missing/Declined/Unknown 37 817 4.5
## 2 All Patients Not Hispanic/Latino 556 817 68.1
## 3 All Patients Spanish/Hispanic/Latino 224 817 27.4
## 4 Enrolled Missing/Declined/Unknown 1 136 0.7
## 5 Enrolled Not Hispanic/Latino 83 136 61
## 6 Enrolled Spanish/Hispanic/Latino 52 136 38.2
sex_base <- df_status %>%
dplyr::transmute(
enrolled_group,
Sex = stringr::str_squish(as.character(.data[[sex_col]]))
) %>%
dplyr::filter(is_nonblank(Sex))
sex_tab <- dplyr::bind_rows(
sex_base %>% dplyr::mutate(Group = "All Patients"),
sex_base %>% dplyr::filter(enrolled_group == "Enrolled") %>%
dplyr::mutate(Group = "Enrolled")
) %>%
dplyr::count(Group, Sex, name = "n") %>%
dplyr::group_by(Group) %>%
dplyr::mutate(
N = sum(n),
percent = round(100 * n / N, 1)
) %>%
dplyr::ungroup()
sex_tab
## # A tibble: 4 × 5
## Group Sex n N percent
## <chr> <chr> <int> <int> <dbl>
## 1 All Patients Female 414 953 43.4
## 2 All Patients Male 539 953 56.6
## 3 Enrolled Female 74 136 54.4
## 4 Enrolled Male 62 136 45.6
# ============================================================
# FOOD UTILIZATION + WASTE (T1 + T2)
# Uses existing df_status object (DO NOT redefine df_status)
# Focus: ENROLLED patients, stratified by cohort_final
# ============================================================
# ---- Dependencies assumed already loaded: dplyr, stringr, tidyr, ggplot2 ----
# -------------------------
# 0) Reusable helpers (define ONCE)
# -------------------------
is_nonblank <- function(x) {
x_chr <- as.character(x)
x_std <- stringr::str_to_upper(stringr::str_squish(x_chr))
!(is.na(x_chr) | x_std == "" | x_std %in% c("NA", "N/A", "NULL"))
}
clean_yesno <- function(x) {
s <- stringr::str_to_upper(stringr::str_squish(as.character(x)))
dplyr::case_when(
is.na(s) ~ NA_character_,
s %in% c("", "NA", "N/A", "NULL") ~ NA_character_,
s %in% c("YES", "Y") ~ "Yes",
s %in% c("NO", "N") ~ "No",
TRUE ~ NA_character_
)
}
clean_meals <- function(x) {
s <- stringr::str_to_lower(stringr::str_squish(as.character(x)))
# text -> digits
s <- stringr::str_replace_all(s, c(
"zero" = "0",
"one" = "1",
"two" = "2",
"three" = "3",
"four" = "4",
"five" = "5",
"six" = "6",
"seven" = "7",
"eight" = "8"
))
dplyr::case_when(
is.na(s) ~ NA_character_,
s %in% c("", "na", "n/a", "null", "unsure of #", "unsure") ~ NA_character_,
stringr::str_detect(s, "ran out") ~ "0",
stringr::str_detect(s, "^0$") ~ "0",
stringr::str_detect(s, "^1$") ~ "1",
stringr::str_detect(s, "2-3") | stringr::str_detect(s, "2–3") | stringr::str_detect(s, "2 to 3") |
stringr::str_detect(s, "^2$") | stringr::str_detect(s, "^3$") |
stringr::str_detect(s, "2 a day") | stringr::str_detect(s, "3 a day") ~ "2–3",
stringr::str_detect(s, "4-6") | stringr::str_detect(s, "4 to 6") |
stringr::str_detect(s, "4 to 5") |
stringr::str_detect(s, "3-4") | stringr::str_detect(s, "3 to 4") |
stringr::str_detect(s, "^4$") | stringr::str_detect(s, "^5$") | stringr::str_detect(s, "^6$") |
stringr::str_detect(s, "5-6") ~ "4–6",
stringr::str_detect(s, "\\b7\\b") | stringr::str_detect(s, "\\b8\\b") ~ "7+",
TRUE ~ NA_character_
)
}
clean_ease <- function(x) {
s <- stringr::str_to_upper(stringr::str_squish(as.character(x)))
dplyr::case_when(
is.na(s) ~ NA_character_,
s %in% c("", "NA", "N/A", "NULL") ~ NA_character_,
# numeric variants
s %in% c("1") ~ "Very easy",
s %in% c("2") ~ "Easy",
s %in% c("3") ~ "Neutral",
# text variants
stringr::str_detect(s, "VERY EASY") ~ "Very easy",
s == "EASY" ~ "Easy",
stringr::str_detect(s, "NEUTRAL") ~ "Neutral",
TRUE ~ NA_character_
)
}
# -------------------------
# 1) Column names (EDIT ONLY IF NEEDED)
# -------------------------
col_meals_t1 <- "t1_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"
col_meals_t2 <- "t2_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"
col_ease_t1 <- "t1_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"
col_ease_t2 <- "t2_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"
col_waste_t1 <- "t1_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"
col_waste_t2 <- "t2_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"
need_cols <- c(
"enrolled_group", "cohort_final",
col_meals_t1, col_meals_t2,
col_ease_t1, col_ease_t2,
col_waste_t1, col_waste_t2
)
missing_cols <- setdiff(need_cols, names(df_status))
if (length(missing_cols) > 0) {
stop("Missing columns:\n - ", paste(missing_cols, collapse = "\n - "))
}
# -------------------------
# 2) Base enrolled dataset (cohort-specific, enrolled only)
# -------------------------
df_enrolled <- df_status %>%
dplyr::filter(enrolled_group == "Enrolled", cohort_final %in% c("Cohort 1", "Cohort 2", "Cohort 3")) %>%
dplyr::mutate(cohort_final = factor(cohort_final, levels = c("Cohort 1", "Cohort 2", "Cohort 3")))
# ============================================================
# A) MEALS MADE USING PROGRAM FOOD (T1 + T2)
# ============================================================
make_meals_tab <- function(df, col_name, time_label) {
df %>%
dplyr::transmute(
cohort_final,
response = clean_meals(.data[[col_name]])
) %>%
dplyr::filter(!is.na(response)) %>%
dplyr::mutate(
response = factor(response, levels = c("0", "1", "2–3", "4–6", "7+"), ordered = TRUE)
) %>%
dplyr::count(cohort_final, response, name = "n") %>%
dplyr::group_by(cohort_final) %>%
dplyr::mutate(
N = sum(n),
percent = round(100 * n / N, 1),
timepoint = time_label
) %>%
dplyr::ungroup()
}
plot_meals_by_cohort <- function(tab, title_text, y_max = 70) {
ggplot2::ggplot(tab, ggplot2::aes(x = response, y = percent)) +
ggplot2::geom_col(width = 0.75, fill = "#1F4E79") +
ggplot2::geom_text(
ggplot2::aes(label = paste0(percent, "%\n(n=", n, ")")),
vjust = -0.25, size = 3.5
) +
ggplot2::facet_wrap(~ cohort_final, nrow = 1) +
ggplot2::scale_y_continuous(limits = c(0, y_max), labels = function(x) paste0(x, "%")) +
ggplot2::theme_bw() +
ggplot2::labs(title = title_text, x = "Meals using program food (past week)", y = "% within cohort") +
ggplot2::theme(
plot.title = ggplot2::element_text(face = "bold", size = 14),
axis.text.x = ggplot2::element_text(size = 10)
)
}
meals_t1_tab <- make_meals_tab(df_enrolled, col_meals_t1, "T1")
meals_t2_tab <- make_meals_tab(df_enrolled, col_meals_t2, "T2")
meals_t1_tab
## # A tibble: 8 × 6
## cohort_final response n N percent timepoint
## <fct> <ord> <int> <int> <dbl> <chr>
## 1 Cohort 1 0 3 39 7.7 T1
## 2 Cohort 1 1 1 39 2.6 T1
## 3 Cohort 1 2–3 22 39 56.4 T1
## 4 Cohort 1 4–6 12 39 30.8 T1
## 5 Cohort 1 7+ 1 39 2.6 T1
## 6 Cohort 2 2–3 10 17 58.8 T1
## 7 Cohort 2 4–6 4 17 23.5 T1
## 8 Cohort 2 7+ 3 17 17.6 T1
plot_meals_by_cohort(meals_t1_tab, "Meals Made Using Program Food — T1 (Enrolled, by Cohort)", y_max = 70)

meals_t2_tab
## # A tibble: 5 × 6
## cohort_final response n N percent timepoint
## <fct> <ord> <int> <int> <dbl> <chr>
## 1 Cohort 1 0 3 41 7.3 T2
## 2 Cohort 1 1 1 41 2.4 T2
## 3 Cohort 1 2–3 17 41 41.5 T2
## 4 Cohort 1 4–6 15 41 36.6 T2
## 5 Cohort 1 7+ 5 41 12.2 T2
plot_meals_by_cohort(meals_t2_tab, "Meals Made Using Program Food — T2 (Enrolled, by Cohort)", y_max = 70)

# ============================================================
# B) EASE OF USING PROGRAM FOOD (T1 + T2)
# ============================================================
make_ease_tab <- function(df, col_name, time_label) {
df %>%
dplyr::transmute(
cohort_final,
response = clean_ease(.data[[col_name]])
) %>%
dplyr::filter(!is.na(response)) %>%
dplyr::mutate(
response = factor(response, levels = c("Neutral", "Easy", "Very easy"), ordered = TRUE)
) %>%
dplyr::count(cohort_final, response, name = "n") %>%
dplyr::group_by(cohort_final) %>%
dplyr::mutate(
N = sum(n),
percent = round(100 * n / N, 1),
timepoint = time_label
) %>%
dplyr::ungroup()
}
plot_ease_by_cohort <- function(tab, title_text, y_max = 70) {
ggplot2::ggplot(tab, ggplot2::aes(x = response, y = percent)) +
ggplot2::geom_col(width = 0.75, fill = "#2E86C1") +
ggplot2::geom_text(
ggplot2::aes(label = paste0(percent, "%\n(n=", n, ")")),
vjust = -0.25, size = 3.5
) +
ggplot2::facet_wrap(~ cohort_final, nrow = 1) +
ggplot2::scale_y_continuous(limits = c(0, y_max), labels = function(x) paste0(x, "%")) +
ggplot2::theme_bw() +
ggplot2::labs(title = title_text, x = "", y = "% within cohort") +
ggplot2::theme(plot.title = ggplot2::element_text(face = "bold", size = 14))
}
ease_t1_tab <- make_ease_tab(df_enrolled, col_ease_t1, "T1")
ease_t2_tab <- make_ease_tab(df_enrolled, col_ease_t2, "T2")
ease_t1_tab
## # A tibble: 6 × 6
## cohort_final response n N percent timepoint
## <fct> <ord> <int> <int> <dbl> <chr>
## 1 Cohort 1 Neutral 3 38 7.9 T1
## 2 Cohort 1 Easy 14 38 36.8 T1
## 3 Cohort 1 Very easy 21 38 55.3 T1
## 4 Cohort 2 Neutral 2 24 8.3 T1
## 5 Cohort 2 Easy 14 24 58.3 T1
## 6 Cohort 2 Very easy 8 24 33.3 T1
plot_ease_by_cohort(ease_t1_tab, "Ease of Using Program Food in Meals — T1 (Enrolled, by Cohort)", y_max = 70)

ease_t2_tab
## # A tibble: 3 × 6
## cohort_final response n N percent timepoint
## <fct> <ord> <int> <int> <dbl> <chr>
## 1 Cohort 1 Neutral 2 40 5 T2
## 2 Cohort 1 Easy 21 40 52.5 T2
## 3 Cohort 1 Very easy 17 40 42.5 T2
plot_ease_by_cohort(ease_t2_tab, "Ease of Using Program Food in Meals — T2 (Enrolled, by Cohort)", y_max = 70)

# ============================================================
# C) WASTE: THREW AWAY ANY FOOD FROM MOST RECENT DELIVERY (T1 + T2)
# ============================================================
make_waste_tab <- function(df, col_name, time_label) {
df %>%
dplyr::transmute(
cohort_final,
response = clean_yesno(.data[[col_name]])
) %>%
dplyr::filter(!is.na(response)) %>%
dplyr::mutate(response = factor(response, levels = c("No", "Yes"))) %>%
dplyr::count(cohort_final, response, name = "n") %>%
dplyr::group_by(cohort_final) %>%
dplyr::mutate(
N = sum(n),
percent = round(100 * n / N, 1),
timepoint = time_label
) %>%
dplyr::ungroup()
}
plot_waste_by_cohort <- function(tab, title_text) {
ggplot2::ggplot(tab, ggplot2::aes(x = response, y = percent, fill = response)) +
ggplot2::geom_col(width = 0.65) +
ggplot2::geom_text(
ggplot2::aes(label = paste0(percent, "%\n(n=", n, ")")),
vjust = -0.25, size = 3.6
) +
ggplot2::facet_wrap(~ cohort_final, nrow = 1) +
ggplot2::scale_fill_manual(values = c("No" = "#1a9850", "Yes" = "#d73027")) +
ggplot2::scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
ggplot2::theme_bw() +
ggplot2::labs(title = title_text, x = "", y = "% within cohort") +
ggplot2::theme(
legend.position = "none",
plot.title = ggplot2::element_text(face = "bold", size = 14)
)
}
waste_t1_tab <- make_waste_tab(df_enrolled, col_waste_t1, "T1")
waste_t2_tab <- make_waste_tab(df_enrolled, col_waste_t2, "T2")
waste_t1_tab
## # A tibble: 4 × 6
## cohort_final response n N percent timepoint
## <fct> <fct> <int> <int> <dbl> <chr>
## 1 Cohort 1 No 30 37 81.1 T1
## 2 Cohort 1 Yes 7 37 18.9 T1
## 3 Cohort 2 No 21 26 80.8 T1
## 4 Cohort 2 Yes 5 26 19.2 T1
plot_waste_by_cohort(waste_t1_tab, "Threw Away Any Delivered Food? — T1 (Enrolled, by Cohort)")

waste_t2_tab
## # A tibble: 2 × 6
## cohort_final response n N percent timepoint
## <fct> <fct> <int> <int> <dbl> <chr>
## 1 Cohort 1 No 37 40 92.5 T2
## 2 Cohort 1 Yes 3 40 7.5 T2
plot_waste_by_cohort(waste_t2_tab, "Threw Away Any Delivered Food? — T2 (Enrolled, by Cohort)")

# ============================================================
# FOOD UTILIZATION + WASTE (T1 + T2) — ENROLLED ONLY (NO COHORT)
# ============================================================
# ---- Column names (edit ONLY if needed) ----
col_meals_t1 <- "t1_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"
col_meals_t2 <- "t2_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"
col_ease_t1 <- "t1_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"
col_ease_t2 <- "t2_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"
col_waste_t1 <- "t1_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"
col_waste_t2 <- "t2_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"
need_cols <- c("enrolled_group", col_meals_t1, col_meals_t2, col_ease_t1, col_ease_t2, col_waste_t1, col_waste_t2)
missing_cols <- setdiff(need_cols, names(df_status))
if (length(missing_cols) > 0) stop("Missing columns:\n - ", paste(missing_cols, collapse = "\n - "))
# ---- Enrolled only (single object; df_status untouched) ----
df_enrolled <- df_status %>% dplyr::filter(enrolled_group == "Enrolled")
# =========================
# A) MEALS USING PROGRAM FOOD (T1 + T2)
# =========================
make_meals_tab <- function(df, col_name, time_label) {
df %>%
dplyr::transmute(response = clean_meals(.data[[col_name]])) %>%
dplyr::filter(!is.na(response)) %>%
dplyr::mutate(response = factor(response, levels = c("0", "1", "2–3", "4–6", "7+"), ordered = TRUE)) %>%
dplyr::count(response, name = "n") %>%
dplyr::mutate(
N = sum(n),
percent = round(100 * n / N, 1),
timepoint = time_label,
label = paste0(percent, "% (n=", n, ")")
)
}
plot_meals <- function(tab, title_text, y_max = 70) {
ggplot2::ggplot(tab, ggplot2::aes(x = response, y = percent)) +
ggplot2::geom_col(width = 0.75, fill = "#1F4E79") +
ggplot2::geom_text(ggplot2::aes(label = label), vjust = -0.25, size = 3.8) +
ggplot2::scale_y_continuous(limits = c(0, y_max), labels = function(x) paste0(x, "%")) +
ggplot2::theme_bw() +
ggplot2::labs(title = title_text, x = "Number of meals", y = "% of enrolled respondents") +
ggplot2::theme(plot.title = ggplot2::element_text(face = "bold", size = 14))
}
meals_t1_tab <- make_meals_tab(df_enrolled, col_meals_t1, "T1")
meals_t2_tab <- make_meals_tab(df_enrolled, col_meals_t2, "T2")
meals_t1_tab
## # A tibble: 5 × 6
## response n N percent timepoint label
## <ord> <int> <int> <dbl> <chr> <chr>
## 1 0 3 56 5.4 T1 5.4% (n=3)
## 2 1 1 56 1.8 T1 1.8% (n=1)
## 3 2–3 32 56 57.1 T1 57.1% (n=32)
## 4 4–6 16 56 28.6 T1 28.6% (n=16)
## 5 7+ 4 56 7.1 T1 7.1% (n=4)
plot_meals(meals_t1_tab, paste0("Meals Prepared Using Program Food — T1 (Enrolled, n=", unique(meals_t1_tab$N), ")"), y_max = 70)

meals_t2_tab
## # A tibble: 5 × 6
## response n N percent timepoint label
## <ord> <int> <int> <dbl> <chr> <chr>
## 1 0 3 41 7.3 T2 7.3% (n=3)
## 2 1 1 41 2.4 T2 2.4% (n=1)
## 3 2–3 17 41 41.5 T2 41.5% (n=17)
## 4 4–6 15 41 36.6 T2 36.6% (n=15)
## 5 7+ 5 41 12.2 T2 12.2% (n=5)
plot_meals(meals_t2_tab, paste0("Meals Prepared Using Program Food — T2 (Enrolled, n=", unique(meals_t2_tab$N), ")"), y_max = 70)

# =========================
# B) EASE OF USING PROGRAM FOOD (T1 + T2)
# =========================
make_ease_tab <- function(df, col_name, time_label) {
df %>%
dplyr::transmute(response = clean_ease(.data[[col_name]])) %>%
dplyr::filter(!is.na(response)) %>%
dplyr::mutate(response = factor(response, levels = c("Neutral", "Easy", "Very easy"), ordered = TRUE)) %>%
dplyr::count(response, name = "n") %>%
dplyr::mutate(
N = sum(n),
percent = round(100 * n / N, 1),
timepoint = time_label,
label = paste0(percent, "% (n=", n, ")")
)
}
plot_ease <- function(tab, title_text, y_max = 70) {
ggplot2::ggplot(tab, ggplot2::aes(x = response, y = percent)) +
ggplot2::geom_col(width = 0.75, fill = "#2E86C1") +
ggplot2::geom_text(ggplot2::aes(label = label), vjust = -0.25, size = 3.8) +
ggplot2::scale_y_continuous(limits = c(0, y_max), labels = function(x) paste0(x, "%")) +
ggplot2::theme_bw() +
ggplot2::labs(title = title_text, x = "", y = "% of enrolled respondents") +
ggplot2::theme(plot.title = ggplot2::element_text(face = "bold", size = 14))
}
ease_t1_tab <- make_ease_tab(df_enrolled, col_ease_t1, "T1")
ease_t2_tab <- make_ease_tab(df_enrolled, col_ease_t2, "T2")
ease_t1_tab
## # A tibble: 3 × 6
## response n N percent timepoint label
## <ord> <int> <int> <dbl> <chr> <chr>
## 1 Neutral 5 62 8.1 T1 8.1% (n=5)
## 2 Easy 28 62 45.2 T1 45.2% (n=28)
## 3 Very easy 29 62 46.8 T1 46.8% (n=29)
plot_ease(ease_t1_tab, paste0("Ease of Using Program Food in Meals — T1 (Enrolled, n=", unique(ease_t1_tab$N), ")"), y_max = 70)

ease_t2_tab
## # A tibble: 3 × 6
## response n N percent timepoint label
## <ord> <int> <int> <dbl> <chr> <chr>
## 1 Neutral 2 40 5 T2 5% (n=2)
## 2 Easy 21 40 52.5 T2 52.5% (n=21)
## 3 Very easy 17 40 42.5 T2 42.5% (n=17)
plot_ease(ease_t2_tab, paste0("Ease of Using Program Food in Meals — T2 (Enrolled, n=", unique(ease_t2_tab$N), ")"), y_max = 70)

# =========================
# C) WASTE (T1 + T2)
# =========================
make_waste_tab <- function(df, col_name, time_label) {
df %>%
dplyr::transmute(response = clean_yesno(.data[[col_name]])) %>%
dplyr::filter(!is.na(response)) %>%
dplyr::mutate(response = factor(response, levels = c("No", "Yes"))) %>%
dplyr::count(response, name = "n") %>%
dplyr::mutate(
N = sum(n),
percent = round(100 * n / N, 1),
timepoint = time_label,
label = paste0(percent, "%\n(n=", n, ")")
)
}
plot_waste <- function(tab, title_text) {
ggplot2::ggplot(tab, ggplot2::aes(x = response, y = percent, fill = response)) +
ggplot2::geom_col(width = 0.65) +
ggplot2::geom_text(ggplot2::aes(label = label), vjust = -0.25, size = 4) +
ggplot2::scale_fill_manual(values = c("No" = "#1a9850", "Yes" = "#d73027")) +
ggplot2::scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
ggplot2::theme_bw() +
ggplot2::labs(title = title_text, x = "", y = "% of enrolled respondents") +
ggplot2::theme(
legend.position = "none",
plot.title = ggplot2::element_text(face = "bold", size = 14)
)
}
waste_t1_tab <- make_waste_tab(df_enrolled, col_waste_t1, "T1")
waste_t2_tab <- make_waste_tab(df_enrolled, col_waste_t2, "T2")
waste_t1_tab
## # A tibble: 2 × 6
## response n N percent timepoint label
## <fct> <int> <int> <dbl> <chr> <chr>
## 1 No 51 63 81 T1 "81%\n(n=51)"
## 2 Yes 12 63 19 T1 "19%\n(n=12)"
plot_waste(waste_t1_tab, paste0("Threw Away Any Delivered Food? — T1 (Enrolled, n=", unique(waste_t1_tab$N), ")"))

waste_t2_tab
## # A tibble: 2 × 6
## response n N percent timepoint label
## <fct> <int> <int> <dbl> <chr> <chr>
## 1 No 37 40 92.5 T2 "92.5%\n(n=37)"
## 2 Yes 3 40 7.5 T2 "7.5%\n(n=3)"
plot_waste(waste_t2_tab, paste0("Threw Away Any Delivered Food? — T2 (Enrolled, n=", unique(waste_t2_tab$N), ")"))

# ============================================================
# EDUCATION DATA: SELF-MANAGEMENT SURVEY (T1) — ENROLLED ONLY
# Verifies the 4 headline % tiles + confidence table + control pie
# Uses existing objects: df_status, is_nonblank(), clean_confidence(), clean_impact()
# df_status is NOT modified.
# ============================================================
# ---- Enrolled only (new object; df_status untouched) ----
df_enrolled <- df_status %>% dplyr::filter(enrolled_group == "Enrolled")
# =========================
# 0) Column names (from your script)
# =========================
col_t1_manage <- "t1_have_you_done_anything_in_the_past_week_to_help_manage_your_diabetes_like_walking_portion_control_or_drinking_more_water"
col_help_choices <- "t1_did_the_nutrition_material_you_got_from_white_plains_hospital_help_you_make_healthier_food_choices"
col_conf_food <- "t1_how_confident_are_you_in_making_food_choices_that_help_control_your_blood_sugar"
col_conf_labels <- "t1_how_confident_are_you_in_understanding_food_labels_or_nutrition_information"
col_conf_prepare <- "t1_how_confident_are_you_in_your_ability_to_prepare_healthy_meals_with_the_food_you_typically_have_at_home"
col_help_control <- "t1_how_much_did_this_program_help_you_feel_more_in_control_of_your_diabetes"
need_cols <- c(col_t1_manage, col_help_choices, col_conf_food, col_conf_labels, col_conf_prepare, col_help_control)
missing_cols <- setdiff(need_cols, names(df_status))
if (length(missing_cols) > 0) stop("Missing columns:\n - ", paste(missing_cols, collapse = "\n - "))
# =========================
# 1) Tile helper: compute % for a condition among VALID responses
# =========================
tile_pct <- function(df, col, clean_fun, keep_levels = NULL, predicate_fun) {
d <- df %>%
dplyr::transmute(resp = clean_fun(.data[[col]])) %>%
dplyr::filter(!is.na(resp))
if (!is.null(keep_levels)) d <- d %>% dplyr::filter(resp %in% keep_levels)
N <- nrow(d)
if (N == 0) return(tibble::tibble(n = 0, N = 0, pct = NA_real_))
n <- sum(predicate_fun(d$resp), na.rm = TRUE)
tibble::tibble(n = n, N = N, pct = round(100 * n / N, 1))
}
# =========================
# 2) TILE 1 — "take daily self-mgmt actions"
# You categorized daily in your earlier chunk via strings like "DAILY..."
# We'll re-use that same approach with a light cleaner here.
# =========================
clean_manage <- function(x) {
raw_chr <- as.character(x)
raw_std <- stringr::str_to_upper(stringr::str_squish(raw_chr))
missing_tokens <- c(
"", "NA", "N/A", "NULL", "NONE",
"NOT SURVEYED", "NOT ASKED", "NOT APPLICABLE",
"SKIPPED", "MISSING", "UNKNOWN", "-", ".", "--"
)
dplyr::case_when(
is.na(raw_chr) ~ NA_character_,
raw_std %in% missing_tokens ~ NA_character_,
stringr::str_detect(raw_std, "^DAILY") ~ "Daily",
raw_std %in% c("Y", "YES") ~ "Yes (unspecified)",
raw_std %in% c("N", "NO") ~ "No",
stringr::str_detect(raw_std, "OCCASION") ~ "Occasionally",
TRUE ~ "Other / unclear"
)
}
tile_daily <- tile_pct(
df_enrolled,
col_t1_manage,
clean_manage,
keep_levels = c("Daily","Yes (unspecified)","Occasionally","No","Other / unclear"),
predicate_fun = function(r) r == "Daily"
)
# =========================
# 3) TILE 2 — "say nutrition material helped food choices"
# Cleaner: clean_impact() already maps to Yes/No for yes/no items.
# Count % == "Yes"
# =========================
tile_helped_choices <- tile_pct(
df_enrolled,
col_help_choices,
clean_impact,
keep_levels = c("Yes","No"),
predicate_fun = function(r) r == "Yes"
)
# =========================
# 4) TILE 3 — "confident in food choices for A1C"
# Tile is "Very confident + Confident" (per your slide)
# =========================
tile_conf_food <- tile_pct(
df_enrolled,
col_conf_food,
clean_confidence,
keep_levels = c("Not confident","Somewhat not confident","Somewhat confident","Confident","Very confident","Maybe"),
predicate_fun = function(r) r %in% c("Very confident","Confident")
)
# =========================
# 5) TILE 4 — "confident preparing healthy meals"
# "Very confident + Confident"
# =========================
tile_conf_prepare <- tile_pct(
df_enrolled,
col_conf_prepare,
clean_confidence,
keep_levels = c("Not confident","Somewhat not confident","Somewhat confident","Confident","Very confident","Maybe"),
predicate_fun = function(r) r %in% c("Very confident","Confident")
)
# =========================
# 6) Print tile verification table (matches slide tiles)
# =========================
tile_verify <- dplyr::bind_rows(
tile_daily %>% dplyr::mutate(Metric = "Daily self-mgmt actions"),
tile_helped_choices %>% dplyr::mutate(Metric = "Nutrition material helped food choices (Yes)"),
tile_conf_food %>% dplyr::mutate(Metric = "Confident in food choices for A1C (Very+Confident)"),
tile_conf_prepare %>% dplyr::mutate(Metric = "Confident preparing healthy meals (Very+Confident)")
) %>%
dplyr::select(Metric, n, N, pct) %>%
dplyr::mutate(label = paste0(pct, "% (", n, "/", N, ")"))
tile_verify
## # A tibble: 4 × 5
## Metric n N pct label
## <chr> <int> <int> <dbl> <chr>
## 1 Daily self-mgmt actions 22 40 55 55% (22/…
## 2 Nutrition material helped food choices (Yes) 57 61 93.4 93.4% (5…
## 3 Confident in food choices for A1C (Very+Confident) 46 59 78 78% (46/…
## 4 Confident preparing healthy meals (Very+Confident) 51 56 91.1 91.1% (5…
# =========================
# 7) Confidence table (the one on the slide)
# Rows: Food choices / Labels / Preparing meals
# Columns: Very confident, Confident, Combined
# Denominator: respondents to that question (enrolled only)
# =========================
make_conf_row <- function(df, col, domain_label) {
d <- df %>%
dplyr::transmute(resp = clean_confidence(.data[[col]])) %>%
dplyr::filter(!is.na(resp))
N <- nrow(d)
vc <- sum(d$resp == "Very confident")
c <- sum(d$resp == "Confident")
tibble::tibble(
Domain = domain_label,
N = N,
`Very Confident` = round(100 * vc / N, 1),
`Confident` = round(100 * c / N, 1),
`Combined` = round(100 * (vc + c) / N, 1)
)
}
conf_table <- dplyr::bind_rows(
make_conf_row(df_enrolled, col_conf_food, "Food choices to control A1C"),
make_conf_row(df_enrolled, col_conf_labels, "Understanding food labels"),
make_conf_row(df_enrolled, col_conf_prepare, "Preparing healthy meals at home")
)
conf_table
## # A tibble: 3 × 5
## Domain N `Very Confident` Confident Combined
## <chr> <int> <dbl> <dbl> <dbl>
## 1 Food choices to control A1C 64 56.2 15.6 71.9
## 2 Understanding food labels 64 57.8 17.2 75
## 3 Preparing healthy meals at home 61 63.9 19.7 83.6
# =========================
# 8) Program helped feel more in control (impact distribution)
# Slide shows: % "A lot" | % "Somewhat" | % "A little"
# We'll compute distribution among valid responses (enrolled only)
# =========================
control_tab <- df_enrolled %>%
dplyr::transmute(resp = clean_impact(.data[[col_help_control]])) %>%
dplyr::filter(!is.na(resp), resp %in% c("A lot","Somewhat","A little","Not at all")) %>%
dplyr::count(resp, name = "n") %>%
dplyr::mutate(
N = sum(n),
pct = round(100 * n / N, 1)
) %>%
dplyr::arrange(dplyr::desc(pct))
control_tab
## # A tibble: 3 × 4
## resp n N pct
## <chr> <int> <int> <dbl>
## 1 A lot 39 63 61.9
## 2 Somewhat 13 63 20.6
## 3 A little 11 63 17.5
# Optional: single-line string like your slide footer
control_footer <- control_tab %>%
dplyr::filter(resp %in% c("A lot","Somewhat","A little")) %>%
dplyr::mutate(txt = paste0(pct, "% \"", resp, "\"")) %>%
dplyr::pull(txt)
paste(control_footer, collapse = " | ")
## [1] "61.9% \"A lot\" | 20.6% \"Somewhat\" | 17.5% \"A little\""
df_enrolled %>%
filter(!is.na(.data[[col_conf_prepare]])) %>%
summarise(N = n())
## # A tibble: 1 × 1
## N
## <int>
## 1 68
df_enrolled %>%
count(.data[[col_conf_prepare]])
## # A tibble: 18 × 2
## t1_how_confident_are_you_in_your_ability_to_prepare_healthy_meals_wit…¹ n
## <chr> <int>
## 1 1 1
## 2 2 2
## 3 3 9
## 4 4 14
## 5 5 2
## 6 COFIDENT 1
## 7 CONFIDENT 2
## 8 N/A 3
## 9 NEUTRAL 1
## 10 Neutral 4
## 11 SOMEWHAT CONFIDENT 1
## 12 Somewhat confident 1
## 13 VERY CONFIDENT 3
## 14 Very Confident 1
## 15 Very confident 17
## 16 n/a 4
## 17 very confident 2
## 18 <NA> 68
## # ℹ abbreviated name:
## # ¹t1_how_confident_are_you_in_your_ability_to_prepare_healthy_meals_with_the_food_you_typically_have_at_home
# ============================================================
# PRIMARY OUTCOMES: A1C SLIDE — VERIFY ALL NUMBERS
# Uses your existing objects: df_status, enrolled_group, cohort_final,
# and your existing helpers: parse_delta(), is_nonblank()
# DOES NOT overwrite df_status
# ============================================================
# ---- Columns used on the slide (already in your script) ----
a1c_t1_col <- "a1c_change_t1" # change for: Cohort 1 + Cohort 2 (PDSA2 window)
a1c_t2_col <- "a1c_change_t2" # change for: Cohort 1 only (most recent window)
# ---- Settings that match the slide tiles ----
thr <- 1.0 # ≥1.0% change threshold
# ---- Safety checks ----
stopifnot("enrolled_group" %in% names(df_status))
stopifnot("cohort_final" %in% names(df_status))
stopifnot(a1c_t1_col %in% names(df_status))
stopifnot(a1c_t2_col %in% names(df_status))
# ============================================================
# 0) Helper: classify A1C change exactly like the slide
# (Improved = delta <= -1, Worsened = delta >= +1, else No change)
# ============================================================
classify_a1c <- function(delta, thr = 1.0) {
dplyr::case_when(
is.na(delta) ~ NA_character_,
delta <= -thr ~ "Improved",
delta >= thr ~ "Worsened",
TRUE ~ "No change"
)
}
# ============================================================
# 1) A1C CHANGE @ T1 TILE
# Slide: "A1C Change at T1: Cohort 1 + Cohort 2 (n=__)"
# We use ENROLLED ONLY, cohort 1 or 2, and non-missing numeric deltas
# ============================================================
a1c_t1_df <- df_status %>%
dplyr::filter(enrolled_group == "Enrolled",
cohort_final %in% c("Cohort 1", "Cohort 2")) %>%
dplyr::transmute(
cohort_final,
delta = parse_delta(.data[[a1c_t1_col]]),
bucket = classify_a1c(parse_delta(.data[[a1c_t1_col]]), thr)
) %>%
dplyr::filter(!is.na(delta), !is.na(bucket))
a1c_t1_tile <- a1c_t1_df %>%
dplyr::count(bucket, name = "n") %>%
dplyr::mutate(
N = sum(n),
percent = round(100 * n / N, 1)
) %>%
dplyr::arrange(factor(bucket, levels = c("Improved","No change","Worsened")))
a1c_t1_tile
## # A tibble: 3 × 4
## bucket n N percent
## <chr> <int> <int> <dbl>
## 1 Improved 32 51 62.7
## 2 No change 17 51 33.3
## 3 Worsened 2 51 3.9
# This table should match the 3 tile numbers on the slide:
# Improved %, No change %, Worsened %, and N (denominator)
# ============================================================
# 2) A1C CHANGE @ T2 TILE
# Slide: "A1C Change at T2: Cohort 1 (n=__)"
# ENROLLED ONLY, Cohort 1, and non-missing numeric deltas
# ============================================================
a1c_t2_df <- df_status %>%
dplyr::filter(enrolled_group == "Enrolled",
cohort_final == "Cohort 1") %>%
dplyr::transmute(
delta = parse_delta(.data[[a1c_t2_col]]),
bucket = classify_a1c(parse_delta(.data[[a1c_t2_col]]), thr)
) %>%
dplyr::filter(!is.na(delta), !is.na(bucket))
a1c_t2_tile <- a1c_t2_df %>%
dplyr::count(bucket, name = "n") %>%
dplyr::mutate(
N = sum(n),
percent = round(100 * n / N, 1)
) %>%
dplyr::arrange(factor(bucket, levels = c("Improved","No change","Worsened")))
a1c_t2_tile
## # A tibble: 3 × 4
## bucket n N percent
## <chr> <int> <int> <dbl>
## 1 Improved 19 30 63.3
## 2 No change 9 30 30
## 3 Worsened 2 30 6.7
# This table should match the Cohort 1 T2 tile numbers (e.g., 63.3 / 30.0 / 6.7 and N=30)
# ============================================================
# 3) COHORT-LEVEL SUMMARY TABLE (the big table on the slide)
# Rows: Cohort 1 / Cohort 2 / Cohort 3
# Columns used on slide:
# N (total enrolled in cohort)
# A1c Improved (T1): count/denom (%)
# A1c Improved (T2): count/denom (%) [Cohort 1 only]
# Chart review completion line at bottom:
# Cohort 1: 19/60 (31.7%) Cohort 2: 23/41 (56.1%)
# IMPORTANT: The slide’s A1c Improved cells appear to be using
# cohort N as denominator (e.g., 29/60), NOT the number with A1C deltas.
# The code below computes BOTH denominators so you can see which matches.
# ============================================================
# ---- cohort N (enrolled total) ----
cohort_N <- df_status %>%
dplyr::filter(enrolled_group == "Enrolled") %>%
dplyr::count(cohort_final, name = "N_enrolled") %>%
dplyr::filter(cohort_final %in% c("Cohort 1","Cohort 2","Cohort 3"))
# ---- A1C improved counts (numerator) + delta-available denominator ----
a1c_t1_by_cohort <- df_status %>%
dplyr::filter(enrolled_group == "Enrolled",
cohort_final %in% c("Cohort 1","Cohort 2","Cohort 3")) %>%
dplyr::transmute(
cohort_final,
delta_t1 = parse_delta(.data[[a1c_t1_col]]),
bucket_t1 = classify_a1c(parse_delta(.data[[a1c_t1_col]]), thr)
) %>%
dplyr::filter(!is.na(delta_t1), !is.na(bucket_t1)) %>%
dplyr::group_by(cohort_final) %>%
dplyr::summarise(
denom_delta_t1 = dplyr::n(),
num_improved_t1 = sum(bucket_t1 == "Improved"),
pct_improved_t1 = round(100 * num_improved_t1 / denom_delta_t1, 1),
.groups = "drop"
)
a1c_t2_by_cohort1 <- df_status %>%
dplyr::filter(enrolled_group == "Enrolled", cohort_final == "Cohort 1") %>%
dplyr::transmute(
cohort_final,
delta_t2 = parse_delta(.data[[a1c_t2_col]]),
bucket_t2 = classify_a1c(parse_delta(.data[[a1c_t2_col]]), thr)
) %>%
dplyr::filter(!is.na(delta_t2), !is.na(bucket_t2)) %>%
dplyr::summarise(
denom_delta_t2 = dplyr::n(),
num_improved_t2 = sum(bucket_t2 == "Improved"),
pct_improved_t2 = round(100 * num_improved_t2 / denom_delta_t2, 1)
)
a1c_t2_by_cohort1
## # A tibble: 1 × 3
## denom_delta_t2 num_improved_t2 pct_improved_t2
## <int> <int> <dbl>
## 1 30 19 63.3
# ---- Combine into one cohort summary with BOTH possible denominators ----
cohort_a1c_summary <- cohort_N %>%
dplyr::left_join(a1c_t1_by_cohort, by = "cohort_final") %>%
dplyr::mutate(
# if the slide is using cohort N as the denominator (e.g., 29/60)
pct_improved_t1_over_cohortN = round(100 * num_improved_t1 / N_enrolled, 1),
a1c_improved_t1_over_cohortN = paste0(num_improved_t1, "/", N_enrolled, " (", pct_improved_t1_over_cohortN, "%)"),
# if the slide is using delta-available denom (more defensible)
a1c_improved_t1_over_deltaN = paste0(num_improved_t1, "/", denom_delta_t1, " (", pct_improved_t1, "%)")
) %>%
dplyr::select(cohort_final, N_enrolled,
num_improved_t1, denom_delta_t1,
a1c_improved_t1_over_cohortN, a1c_improved_t1_over_deltaN)
cohort_a1c_summary
## # A tibble: 3 × 6
## cohort_final N_enrolled num_improved_t1 denom_delta_t1 a1c_improved_t1_over_…¹
## <chr> <int> <int> <int> <chr>
## 1 Cohort 1 60 19 28 19/60 (31.7%)
## 2 Cohort 2 41 13 23 13/41 (31.7%)
## 3 Cohort 3 35 NA NA NA/35 (NA%)
## # ℹ abbreviated name: ¹a1c_improved_t1_over_cohortN
## # ℹ 1 more variable: a1c_improved_t1_over_deltaN <chr>
# Compare a1c_improved_t1_over_cohortN vs your slide cell values.
# If your slide shows 29/60, it’s using cohort N as denominator.
# ============================================================
# 4) CHART REVIEW COMPLETION (bottom line on slide)
# Uses your existing chart-review rules:
# Cohort 1 requires: a1c_t1, ed_t1, ip_t1, a1c_t2, ed_t2, hosp_t2
# Cohort 2 requires: a1c_t1, ed_t1, ip_t1
# We compute 19/60 and 23/41 style.
# ============================================================
# Use your existing *_col vars if already defined earlier:
ed_t1_col <- "ed_utilization_t1"
ip_t1_col <- "ip_utilization_t1"
ed_t2_col <- "ed_utilization_t2"
hosp_t2_col <- "hospital_utilization_t2"
stopifnot(all(c(ed_t1_col, ip_t1_col, ed_t2_col, hosp_t2_col) %in% names(df_status)))
c1_required <- c(a1c_t1_col, ed_t1_col, ip_t1_col, a1c_t2_col, ed_t2_col, hosp_t2_col)
c2_required <- c(a1c_t1_col, ed_t1_col, ip_t1_col)
chart_complete_flag <- function(df, req_cols) {
# TRUE if all required fields are nonblank for that row
apply(df[, req_cols, drop = FALSE], 1, function(row) all(is_nonblank(row)))
}
df_c1_enr <- df_status %>% dplyr::filter(enrolled_group == "Enrolled", cohort_final == "Cohort 1")
df_c2_enr <- df_status %>% dplyr::filter(enrolled_group == "Enrolled", cohort_final == "Cohort 2")
c1_complete_n <- sum(chart_complete_flag(df_c1_enr, c1_required))
c2_complete_n <- sum(chart_complete_flag(df_c2_enr, c2_required))
c1_total <- nrow(df_c1_enr)
c2_total <- nrow(df_c2_enr)
chart_review_line <- tibble::tibble(
Cohort = c("Cohort 1","Cohort 2"),
complete_n = c(c1_complete_n, c2_complete_n),
total_n = c(c1_total, c2_total),
percent = round(100 * complete_n / total_n, 1),
label = paste0(complete_n, "/", total_n, " (", percent, "%)")
)
chart_review_line
## # A tibble: 2 × 5
## Cohort complete_n total_n percent label
## <chr> <int> <int> <dbl> <chr>
## 1 Cohort 1 19 60 31.7 19/60 (31.7%)
## 2 Cohort 2 23 41 56.1 23/41 (56.1%)
# This should reproduce the slide line:
# Cohort 1: 19/60 (31.7%) | Cohort 2: 23/41 (56.1%)
# ============================================================
# A1C CHANGE DISTRIBUTION (T1 and/or T2)
# Percent *within each cohort* out of patients with VALID answers (non-missing delta)
# Buckets: Reduction (≤ -1.0), No change (-1.0 < delta < +1.0), Increase (≥ +1.0)
# ============================================================
thr <- 1.0
# ---- helper (use your existing parse_delta) ----
classify_a1c <- function(delta, thr = 1.0) {
dplyr::case_when(
is.na(delta) ~ NA_character_,
delta <= -thr ~ "Reduction (≥1.0% decrease)",
delta >= thr ~ "Increase (≥1.0% increase)",
TRUE ~ "No change (<1.0% change)"
)
}
# ------------------------------------------------------------
# A) A1C Change @ T1: Cohort 1 + Cohort 2 (cohort-specific %)
# ------------------------------------------------------------
a1c_t1_by_cohort <- df_status %>%
dplyr::filter(
enrolled_group == "Enrolled",
cohort_final %in% c("Cohort 1", "Cohort 2")
) %>%
dplyr::transmute(
cohort_final,
delta = parse_delta(.data[[a1c_t1_col]]),
category = classify_a1c(parse_delta(.data[[a1c_t1_col]]), thr)
) %>%
dplyr::filter(!is.na(delta), !is.na(category)) %>%
dplyr::count(cohort_final, category, name = "n") %>%
dplyr::group_by(cohort_final) %>%
dplyr::mutate(
N_valid = sum(n),
percent = round(100 * n / N_valid, 1),
label = paste0(n, "/", N_valid, " (", percent, "%)")
) %>%
dplyr::ungroup() %>%
dplyr::arrange(
cohort_final,
factor(category, levels = c("Reduction (≥1.0% decrease)", "No change (<1.0% change)", "Increase (≥1.0% increase)"))
)
a1c_t1_by_cohort
## # A tibble: 6 × 6
## cohort_final category n N_valid percent label
## <chr> <chr> <int> <int> <dbl> <chr>
## 1 Cohort 1 Reduction (≥1.0% decrease) 19 28 67.9 19/28 (67.9%)
## 2 Cohort 1 No change (<1.0% change) 8 28 28.6 8/28 (28.6%)
## 3 Cohort 1 Increase (≥1.0% increase) 1 28 3.6 1/28 (3.6%)
## 4 Cohort 2 Reduction (≥1.0% decrease) 13 23 56.5 13/23 (56.5%)
## 5 Cohort 2 No change (<1.0% change) 9 23 39.1 9/23 (39.1%)
## 6 Cohort 2 Increase (≥1.0% increase) 1 23 4.3 1/23 (4.3%)
# Optional: wide format for slide table layout
a1c_t1_by_cohort_wide <- a1c_t1_by_cohort %>%
dplyr::select(cohort_final, category, n, N_valid, percent, label) %>%
tidyr::pivot_wider(
names_from = category,
values_from = c(n, percent, label),
names_sep = " | "
)
a1c_t1_by_cohort_wide
## # A tibble: 2 × 11
## cohort_final N_valid `n | Reduction (≥1.0% decrease)` n | No change (<1.0% c…¹
## <chr> <int> <int> <int>
## 1 Cohort 1 28 19 8
## 2 Cohort 2 23 13 9
## # ℹ abbreviated name: ¹`n | No change (<1.0% change)`
## # ℹ 7 more variables: `n | Increase (≥1.0% increase)` <int>,
## # `percent | Reduction (≥1.0% decrease)` <dbl>,
## # `percent | No change (<1.0% change)` <dbl>,
## # `percent | Increase (≥1.0% increase)` <dbl>,
## # `label | Reduction (≥1.0% decrease)` <chr>,
## # `label | No change (<1.0% change)` <chr>, …
# ------------------------------------------------------------
# B) A1C Change @ T2: Cohort 1 only (same logic, valid answers)
# (If you later have Cohort 2 T2, add it to cohort_final filter.)
# ------------------------------------------------------------
a1c_t2_by_cohort <- df_status %>%
dplyr::filter(
enrolled_group == "Enrolled",
cohort_final %in% c("Cohort 1") # add "Cohort 2" if/when you have T2 deltas
) %>%
dplyr::transmute(
cohort_final,
delta = parse_delta(.data[[a1c_t2_col]]),
category = classify_a1c(parse_delta(.data[[a1c_t2_col]]), thr)
) %>%
dplyr::filter(!is.na(delta), !is.na(category)) %>%
dplyr::count(cohort_final, category, name = "n") %>%
dplyr::group_by(cohort_final) %>%
dplyr::mutate(
N_valid = sum(n),
percent = round(100 * n / N_valid, 1),
label = paste0(n, "/", N_valid, " (", percent, "%)")
) %>%
dplyr::ungroup() %>%
dplyr::arrange(
cohort_final,
factor(category, levels = c("Reduction (≥1.0% decrease)", "No change (<1.0% change)", "Increase (≥1.0% increase)"))
)
a1c_t2_by_cohort
## # A tibble: 3 × 6
## cohort_final category n N_valid percent label
## <chr> <chr> <int> <int> <dbl> <chr>
## 1 Cohort 1 Reduction (≥1.0% decrease) 19 30 63.3 19/30 (63.3%)
## 2 Cohort 1 No change (<1.0% change) 9 30 30 9/30 (30%)
## 3 Cohort 1 Increase (≥1.0% increase) 2 30 6.7 2/30 (6.7%)
# ============================================================
# A1C CHANGE DISTRIBUTION TABLE
# - Cohort 1: T1 and T2
# - Cohort 2: T1 only
# - Denominator = valid (non-missing) delta within cohort+timepoint
# ============================================================
thr <- 1.0
classify_a1c <- function(delta, thr = 1.0) {
dplyr::case_when(
is.na(delta) ~ NA_character_,
delta <= -thr ~ "Reduction (≥1.0% decrease)",
delta >= thr ~ "Increase (≥1.0% increase)",
TRUE ~ "No change (<1.0% change)"
)
}
# Build a "long" delta table with explicit timepoints
a1c_delta_long <- dplyr::bind_rows(
# ---- T1: Cohort 1 + Cohort 2 ----
df_status %>%
dplyr::filter(enrolled_group == "Enrolled",
cohort_final %in% c("Cohort 1", "Cohort 2")) %>%
dplyr::transmute(
cohort_final,
timepoint = "T1",
delta = parse_delta(.data[[a1c_t1_col]])
),
# ---- T2: Cohort 1 only ----
df_status %>%
dplyr::filter(enrolled_group == "Enrolled",
cohort_final == "Cohort 1") %>%
dplyr::transmute(
cohort_final,
timepoint = "T2",
delta = parse_delta(.data[[a1c_t2_col]])
)
) %>%
dplyr::mutate(
category = classify_a1c(delta, thr),
category = factor(
category,
levels = c(
"Reduction (≥1.0% decrease)",
"No change (<1.0% change)",
"Increase (≥1.0% increase)"
)
),
timepoint = factor(timepoint, levels = c("T1", "T2"))
) %>%
dplyr::filter(!is.na(delta), !is.na(category))
# Final table (this is the one you want)
a1c_change_table <- a1c_delta_long %>%
dplyr::count(cohort_final, timepoint, category, name = "n") %>%
dplyr::group_by(cohort_final, timepoint) %>%
dplyr::mutate(
N_valid = sum(n),
percent = round(100 * n / N_valid, 1),
label = paste0(n, "/", N_valid, " (", percent, "%)")
) %>%
dplyr::ungroup() %>%
dplyr::arrange(cohort_final, timepoint, category)
a1c_change_table
## # A tibble: 9 × 7
## cohort_final timepoint category n N_valid percent label
## <chr> <fct> <fct> <int> <int> <dbl> <chr>
## 1 Cohort 1 T1 Reduction (≥1.0% decrease) 19 28 67.9 19/28…
## 2 Cohort 1 T1 No change (<1.0% change) 8 28 28.6 8/28 …
## 3 Cohort 1 T1 Increase (≥1.0% increase) 1 28 3.6 1/28 …
## 4 Cohort 1 T2 Reduction (≥1.0% decrease) 19 30 63.3 19/30…
## 5 Cohort 1 T2 No change (<1.0% change) 9 30 30 9/30 …
## 6 Cohort 1 T2 Increase (≥1.0% increase) 2 30 6.7 2/30 …
## 7 Cohort 2 T1 Reduction (≥1.0% decrease) 13 23 56.5 13/23…
## 8 Cohort 2 T1 No change (<1.0% change) 9 23 39.1 9/23 …
## 9 Cohort 2 T1 Increase (≥1.0% increase) 1 23 4.3 1/23 …
# ============================================================
# CHART REVIEW COMPLETION (TABLES ONLY)
# Cohort 1 + Cohort 2, ENROLLED ONLY
# Split by timepoint (T1/T2) and metric (A1c / ED / IP-Hosp)
# Does NOT overwrite df_status
# ============================================================
stopifnot(exists("df_status"))
stopifnot(all(c("enrolled_group", "cohort_final") %in% names(df_status)))
# ---- Use your known column names (edit ONLY if you ever rename in Excel) ----
a1c_t1_col <- "a1c_change_t1"
ed_t1_col <- "ed_utilization_t1"
ip_t1_col <- "ip_utilization_t1"
a1c_t2_col <- "a1c_change_t2"
ed_t2_col <- "ed_utilization_t2"
hosp_t2_col <- "hospital_utilization_t2"
need_cols <- c(a1c_t1_col, ed_t1_col, ip_t1_col, a1c_t2_col, ed_t2_col, hosp_t2_col)
missing_cols <- setdiff(need_cols, names(df_status))
if (length(missing_cols) > 0) stop("Missing chart-review columns:\n - ", paste(missing_cols, collapse="\n - "))
# ---- Nonblank logic consistent with your earlier code ----
is_nonblank <- function(x) {
x_chr <- as.character(x)
x_std <- str_to_upper(str_squish(x_chr))
!(is.na(x_chr) | x_std == "" | x_std %in% c("NA", "N/A", "NULL"))
}
# ---- Work on a subset df (no mutation of df_status) ----
df_enrolled_c12 <- df_status %>%
filter(enrolled_group == "Enrolled", cohort_final %in% c("Cohort 1", "Cohort 2"))
# Denominators: enrolled patients in each cohort (matches slide logic)
denoms <- df_enrolled_c12 %>%
count(cohort_final, name = "N_enrolled")
# Helper: completion for one cohort + one column
completion_one <- function(cohort_name, col_name, metric_label, time_label) {
d <- df_enrolled_c12 %>% filter(cohort_final == cohort_name)
N <- nrow(d)
n_complete <- sum(is_nonblank(d[[col_name]]))
tibble(
cohort_final = cohort_name,
timepoint = time_label,
metric = metric_label,
n_complete = n_complete,
N_enrolled = N,
percent = round(100 * n_complete / N, 1),
label = paste0(n_complete, "/", N, " (", round(100 * n_complete / N, 1), "%)")
)
}
# ============================================================
# A) FIELD-LEVEL COMPLETION (what % of cohort has each field present)
# ============================================================
chart_review_completion_fields <- bind_rows(
# Cohort 1: T1 fields
completion_one("Cohort 1", a1c_t1_col, "A1c change", "T1"),
completion_one("Cohort 1", ed_t1_col, "ED utilization", "T1"),
completion_one("Cohort 1", ip_t1_col, "IP utilization", "T1"),
# Cohort 1: T2 fields
completion_one("Cohort 1", a1c_t2_col, "A1c change", "T2"),
completion_one("Cohort 1", ed_t2_col, "ED utilization", "T2"),
completion_one("Cohort 1", hosp_t2_col, "Hospital utilization","T2"),
# Cohort 2: T1 fields only (per your schema)
completion_one("Cohort 2", a1c_t1_col, "A1c change", "T1"),
completion_one("Cohort 2", ed_t1_col, "ED utilization", "T1"),
completion_one("Cohort 2", ip_t1_col, "IP utilization", "T1")
) %>%
arrange(cohort_final, timepoint, metric)
chart_review_completion_fields
## # A tibble: 9 × 7
## cohort_final timepoint metric n_complete N_enrolled percent label
## <chr> <chr> <chr> <int> <int> <dbl> <chr>
## 1 Cohort 1 T1 A1c change 29 60 48.3 29/6…
## 2 Cohort 1 T1 ED utilization 60 60 100 60/6…
## 3 Cohort 1 T1 IP utilization 60 60 100 60/6…
## 4 Cohort 1 T2 A1c change 31 60 51.7 31/6…
## 5 Cohort 1 T2 ED utilization 58 60 96.7 58/6…
## 6 Cohort 1 T2 Hospital utilizati… 59 60 98.3 59/6…
## 7 Cohort 2 T1 A1c change 23 41 56.1 23/4…
## 8 Cohort 2 T1 ED utilization 41 41 100 41/4…
## 9 Cohort 2 T1 IP utilization 41 41 100 41/4…
# ============================================================
# B) TIMEPOINT-LEVEL "CHART REVIEW COMPLETE" (all required fields present)
# - Cohort 1 T1 requires: A1c T1 + ED T1 + IP T1
# - Cohort 1 T2 requires: A1c T2 + ED T2 + Hospital T2
# - Cohort 2 T1 requires: A1c T1 + ED T1 + IP T1
# ============================================================
complete_all_required <- function(d, cols_required) {
if (nrow(d) == 0) return(0L)
sum(apply(d[, cols_required, drop = FALSE], 1, function(row) all(is_nonblank(row))))
}
chart_review_complete_timepoint <- bind_rows(
# Cohort 1 - T1 complete
{
d <- df_enrolled_c12 %>% filter(cohort_final == "Cohort 1")
n_complete <- complete_all_required(d, c(a1c_t1_col, ed_t1_col, ip_t1_col))
N <- nrow(d)
tibble(cohort_final="Cohort 1", timepoint="T1", rule="All required fields present",
n_complete=n_complete, N_enrolled=N,
percent=round(100*n_complete/N,1),
label=paste0(n_complete,"/",N," (",round(100*n_complete/N,1),"%)"))
},
# Cohort 1 - T2 complete
{
d <- df_enrolled_c12 %>% filter(cohort_final == "Cohort 1")
n_complete <- complete_all_required(d, c(a1c_t2_col, ed_t2_col, hosp_t2_col))
N <- nrow(d)
tibble(cohort_final="Cohort 1", timepoint="T2", rule="All required fields present",
n_complete=n_complete, N_enrolled=N,
percent=round(100*n_complete/N,1),
label=paste0(n_complete,"/",N," (",round(100*n_complete/N,1),"%)"))
},
# Cohort 2 - T1 complete
{
d <- df_enrolled_c12 %>% filter(cohort_final == "Cohort 2")
n_complete <- complete_all_required(d, c(a1c_t1_col, ed_t1_col, ip_t1_col))
N <- nrow(d)
tibble(cohort_final="Cohort 2", timepoint="T1", rule="All required fields present",
n_complete=n_complete, N_enrolled=N,
percent=round(100*n_complete/N,1),
label=paste0(n_complete,"/",N," (",round(100*n_complete/N,1),"%)"))
}
) %>%
arrange(cohort_final, timepoint)
chart_review_complete_timepoint
## # A tibble: 3 × 7
## cohort_final timepoint rule n_complete N_enrolled percent label
## <chr> <chr> <chr> <int> <int> <dbl> <chr>
## 1 Cohort 1 T1 All required field… 29 60 48.3 29/6…
## 2 Cohort 1 T2 All required field… 31 60 51.7 31/6…
## 3 Cohort 2 T1 All required field… 23 41 56.1 23/4…
# ============================================================
# VERIFY ED + HOSP/IP UTILIZATION SLIDE NUMBERS (TABLES ONLY)
# Uses existing df_status (does NOT overwrite it)
# Cohort-specific, ENROLLED only
# - Cohort 1: Baseline + T1 + T2 (ED + Hosp)
# - Cohort 2: Baseline + T1 (ED + IP)
# Outputs:
# 1) Valid-N + % complete per cohort/timepoint/metric
# 2) Distribution tables (0,1,2,3,4,5+ or 3+ for IP/Hosp if you want)
# 3) Change-category tables (Less / No change / More) with n + %
# ============================================================
library(dplyr)
library(stringr)
library(tidyr)
library(tibble)
stopifnot(exists("df_status"))
stopifnot(all(c("enrolled_group","cohort_final") %in% names(df_status)))
# ---- Column names from your file ----
col_ed_base <- "ed_vis_last_90_days"
col_hosp_base <- "hosp_last_90_days"
col_ed_t1 <- "ed_utilization_t1"
col_ip_t1 <- "ip_utilization_t1" # cohort 2 uses IP at T1
col_ed_t2 <- "ed_utilization_t2"
col_hosp_t2 <- "hospital_utilization_t2" # cohort 1 uses Hospital at T2
need_cols <- c(col_ed_base, col_hosp_base, col_ed_t1, col_ip_t1, col_ed_t2, col_hosp_t2)
miss <- setdiff(need_cols, names(df_status))
if (length(miss) > 0) stop("Missing columns:\n - ", paste(miss, collapse = "\n - "))
# ---- Reuse your parser if it exists; otherwise define it (safe) ----
if (!exists("parse_count")) {
parse_count <- function(x) {
s <- str_squish(as.character(x))
s[s == ""] <- NA_character_
s[str_to_upper(s) %in% c("NA","N/A","NULL")] <- NA_character_
suppressWarnings(as.numeric(str_replace_all(s, "[^0-9\\-\\.]", "")))
}
}
# ---- Nonblank (completion) ----
is_nonblank <- function(x) {
s <- str_to_upper(str_squish(as.character(x)))
!(is.na(s) | s == "" | s %in% c("NA","N/A","NULL"))
}
# ---- Enrolled + cohort filter (analysis df only; df_status untouched) ----
df_enr <- df_status %>% filter(enrolled_group == "Enrolled", cohort_final %in% c("Cohort 1","Cohort 2"))
# ============================================================
# 1) COMPLETION (valid nonblank) — EXACTLY what you need for slide footers
# ============================================================
completion_one <- function(cohort_name, timepoint, metric, col_name) {
d <- df_enr %>% filter(cohort_final == cohort_name)
N <- nrow(d)
n_valid <- sum(is_nonblank(d[[col_name]]))
tibble(
cohort_final = cohort_name,
timepoint = timepoint,
metric = metric,
n_valid = n_valid,
N_enrolled = N,
percent = round(100 * n_valid / N, 1),
label = paste0(n_valid, "/", N, " (", round(100 * n_valid / N, 1), "%)")
)
}
util_completion <- bind_rows(
# Cohort 1
completion_one("Cohort 1","Baseline","ED visits (last 90d)", col_ed_base),
completion_one("Cohort 1","Baseline","Hospital stays (last 90d)", col_hosp_base),
completion_one("Cohort 1","T1","ED utilization", col_ed_t1),
completion_one("Cohort 1","T1","IP utilization", col_ip_t1),
completion_one("Cohort 1","T2","ED utilization", col_ed_t2),
completion_one("Cohort 1","T2","Hospital utilization", col_hosp_t2),
# Cohort 2
completion_one("Cohort 2","Baseline","ED visits (last 90d)", col_ed_base),
completion_one("Cohort 2","Baseline","Hospital stays (last 90d)", col_hosp_base),
completion_one("Cohort 2","T1","ED utilization", col_ed_t1),
completion_one("Cohort 2","T1","IP utilization", col_ip_t1)
) %>% arrange(cohort_final, timepoint, metric)
util_completion
## # A tibble: 10 × 7
## cohort_final timepoint metric n_valid N_enrolled percent label
## <chr> <chr> <chr> <int> <int> <dbl> <chr>
## 1 Cohort 1 Baseline ED visits (last 90d) 60 60 100 60/6…
## 2 Cohort 1 Baseline Hospital stays (last… 60 60 100 60/6…
## 3 Cohort 1 T1 ED utilization 60 60 100 60/6…
## 4 Cohort 1 T1 IP utilization 60 60 100 60/6…
## 5 Cohort 1 T2 ED utilization 58 60 96.7 58/6…
## 6 Cohort 1 T2 Hospital utilization 59 60 98.3 59/6…
## 7 Cohort 2 Baseline ED visits (last 90d) 41 41 100 41/4…
## 8 Cohort 2 Baseline Hospital stays (last… 41 41 100 41/4…
## 9 Cohort 2 T1 ED utilization 41 41 100 41/4…
## 10 Cohort 2 T1 IP utilization 41 41 100 41/4…
# ============================================================
# 2) DISTRIBUTIONS — bucketed counts/percents (for slide bar numbers)
# ED: 0,1,2,3,4,5+
# IP/Hosp: 0,1,2,3+
# ============================================================
bucket_n <- function(x, cap) {
xi <- as.integer(x)
xi[xi < 0] <- NA_integer_
b <- ifelse(xi >= cap, cap, xi)
ifelse(is.na(b), NA_character_, ifelse(b == cap, paste0(cap, "+"), as.character(b)))
}
make_dist_tab <- function(cohort_name, timepoint, metric, col_name, cap) {
d <- df_enr %>% filter(cohort_final == cohort_name)
tab <- d %>%
transmute(val = parse_count(.data[[col_name]])) %>%
filter(!is.na(val)) %>%
transmute(bucket = bucket_n(val, cap)) %>%
count(bucket, name = "n") %>%
tidyr::complete(bucket = c(as.character(0:(cap-1)), paste0(cap, "+")), fill = list(n = 0)) %>%
mutate(
N_valid = sum(n),
percent = ifelse(N_valid > 0, round(100 * n / N_valid, 1), 0),
label = paste0(n, "/", N_valid, " (", percent, "%)"),
cohort_final = cohort_name,
timepoint = timepoint,
metric = metric
) %>%
select(cohort_final, timepoint, metric, bucket, n, N_valid, percent, label)
tab
}
# --- ED distributions (cap 5) ---
ed_dist_tables <- bind_rows(
# Cohort 1
make_dist_tab("Cohort 1","Baseline","ED visits", col_ed_base, 5),
make_dist_tab("Cohort 1","T1","ED utilization", col_ed_t1, 5),
make_dist_tab("Cohort 1","T2","ED utilization", col_ed_t2, 5),
# Cohort 2
make_dist_tab("Cohort 2","Baseline","ED visits", col_ed_base, 5),
make_dist_tab("Cohort 2","T1","ED utilization", col_ed_t1, 5)
) %>% arrange(cohort_final, timepoint, bucket)
ed_dist_tables
## # A tibble: 30 × 8
## cohort_final timepoint metric bucket n N_valid percent label
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <chr>
## 1 Cohort 1 Baseline ED visits 0 26 60 43.3 26/60 (43…
## 2 Cohort 1 Baseline ED visits 1 18 60 30 18/60 (30…
## 3 Cohort 1 Baseline ED visits 2 11 60 18.3 11/60 (18…
## 4 Cohort 1 Baseline ED visits 3 4 60 6.7 4/60 (6.7…
## 5 Cohort 1 Baseline ED visits 4 0 60 0 0/60 (0%)
## 6 Cohort 1 Baseline ED visits 5+ 1 60 1.7 1/60 (1.7…
## 7 Cohort 1 T1 ED utilization 0 41 60 68.3 41/60 (68…
## 8 Cohort 1 T1 ED utilization 1 11 60 18.3 11/60 (18…
## 9 Cohort 1 T1 ED utilization 2 6 60 10 6/60 (10%)
## 10 Cohort 1 T1 ED utilization 3 1 60 1.7 1/60 (1.7…
## # ℹ 20 more rows
# --- IP/Hospital distributions (cap 3) ---
ip_hosp_dist_tables <- bind_rows(
# Cohort 1: baseline hosp + T1 IP + T2 hosp
make_dist_tab("Cohort 1","Baseline","Hospital stays", col_hosp_base, 3),
make_dist_tab("Cohort 1","T1","IP utilization", col_ip_t1, 3),
make_dist_tab("Cohort 1","T2","Hospital utilization", col_hosp_t2, 3),
# Cohort 2: baseline hosp + T1 IP
make_dist_tab("Cohort 2","Baseline","Hospital stays", col_hosp_base, 3),
make_dist_tab("Cohort 2","T1","IP utilization", col_ip_t1, 3)
) %>% arrange(cohort_final, timepoint, bucket)
ip_hosp_dist_tables
## # A tibble: 20 × 8
## cohort_final timepoint metric bucket n N_valid percent label
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <chr>
## 1 Cohort 1 Baseline Hospital stays 0 31 60 51.7 31/6…
## 2 Cohort 1 Baseline Hospital stays 1 19 60 31.7 19/6…
## 3 Cohort 1 Baseline Hospital stays 2 9 60 15 9/60…
## 4 Cohort 1 Baseline Hospital stays 3+ 1 60 1.7 1/60…
## 5 Cohort 1 T1 IP utilization 0 46 60 76.7 46/6…
## 6 Cohort 1 T1 IP utilization 1 10 60 16.7 10/6…
## 7 Cohort 1 T1 IP utilization 2 4 60 6.7 4/60…
## 8 Cohort 1 T1 IP utilization 3+ 0 60 0 0/60…
## 9 Cohort 1 T2 Hospital utilizati… 0 54 59 91.5 54/5…
## 10 Cohort 1 T2 Hospital utilizati… 1 5 59 8.5 5/59…
## 11 Cohort 1 T2 Hospital utilizati… 2 0 59 0 0/59…
## 12 Cohort 1 T2 Hospital utilizati… 3+ 0 59 0 0/59…
## 13 Cohort 2 Baseline Hospital stays 0 0 41 0 0/41…
## 14 Cohort 2 Baseline Hospital stays 1 26 41 63.4 26/4…
## 15 Cohort 2 Baseline Hospital stays 2 10 41 24.4 10/4…
## 16 Cohort 2 Baseline Hospital stays 3+ 5 41 12.2 5/41…
## 17 Cohort 2 T1 IP utilization 0 36 41 87.8 36/4…
## 18 Cohort 2 T1 IP utilization 1 5 41 12.2 5/41…
## 19 Cohort 2 T1 IP utilization 2 0 41 0 0/41…
## 20 Cohort 2 T1 IP utilization 3+ 0 41 0 0/41…
# ============================================================
# 3) CHANGE CATEGORIES — Less / No change / More
# ED: Baseline→T1 (C1 & C2), and T1→T2 (C1 only)
# IP/Hosp: Baseline→T1 uses (hosp_base→ip_t1), and T1→T2 (ip_t1→hosp_t2) for C1
# ============================================================
make_change_tab <- function(d, before_col, after_col, cohort_name, metric, change_label) {
dd <- d %>%
transmute(
before = parse_count(.data[[before_col]]),
after = parse_count(.data[[after_col]])
) %>%
filter(!is.na(before) & !is.na(after)) %>%
transmute(
category = case_when(
after < before ~ "Less",
after > before ~ "More",
TRUE ~ "No change"
)
) %>%
count(category, name = "n") %>%
mutate(
category = factor(category, levels = c("Less","No change","More")),
N_valid = sum(n),
percent = round(100 * n / N_valid, 1),
label = paste0(n, "/", N_valid, " (", percent, "%)"),
cohort_final = cohort_name,
metric = metric,
change = change_label
) %>%
arrange(category)
dd
}
d_c1 <- df_enr %>% filter(cohort_final == "Cohort 1")
d_c2 <- df_enr %>% filter(cohort_final == "Cohort 2")
util_change_tables <- bind_rows(
# ED change
make_change_tab(d_c1, col_ed_base, col_ed_t1, "Cohort 1", "ED", "Baseline → T1"),
make_change_tab(d_c2, col_ed_base, col_ed_t1, "Cohort 2", "ED", "Baseline → T1"),
make_change_tab(d_c1, col_ed_t1, col_ed_t2, "Cohort 1", "ED", "T1 → T2"),
# IP/Hosp change (note: baseline uses hosp_last_90_days; T1 uses ip_utilization_t1; T2 uses hospital_utilization_t2)
make_change_tab(d_c1, col_hosp_base, col_ip_t1, "Cohort 1", "IP/Hosp", "Baseline → T1"),
make_change_tab(d_c2, col_hosp_base, col_ip_t1, "Cohort 2", "IP/Hosp", "Baseline → T1"),
make_change_tab(d_c1, col_ip_t1, col_hosp_t2, "Cohort 1", "IP/Hosp", "T1 → T2")
) %>%
select(cohort_final, metric, change, category, n, N_valid, percent, label)
util_change_tables
## # A tibble: 17 × 8
## cohort_final metric change category n N_valid percent label
## <chr> <chr> <chr> <fct> <int> <int> <dbl> <chr>
## 1 Cohort 1 ED Baseline → T1 Less 25 60 41.7 25/60 (41…
## 2 Cohort 1 ED Baseline → T1 No change 27 60 45 27/60 (45…
## 3 Cohort 1 ED Baseline → T1 More 8 60 13.3 8/60 (13.…
## 4 Cohort 2 ED Baseline → T1 Less 33 41 80.5 33/41 (80…
## 5 Cohort 2 ED Baseline → T1 No change 7 41 17.1 7/41 (17.…
## 6 Cohort 2 ED Baseline → T1 More 1 41 2.4 1/41 (2.4…
## 7 Cohort 1 ED T1 → T2 Less 15 58 25.9 15/58 (25…
## 8 Cohort 1 ED T1 → T2 No change 30 58 51.7 30/58 (51…
## 9 Cohort 1 ED T1 → T2 More 13 58 22.4 13/58 (22…
## 10 Cohort 1 IP/Hosp Baseline → T1 Less 24 60 40 24/60 (40…
## 11 Cohort 1 IP/Hosp Baseline → T1 No change 30 60 50 30/60 (50…
## 12 Cohort 1 IP/Hosp Baseline → T1 More 6 60 10 6/60 (10%)
## 13 Cohort 2 IP/Hosp Baseline → T1 Less 38 41 92.7 38/41 (92…
## 14 Cohort 2 IP/Hosp Baseline → T1 No change 3 41 7.3 3/41 (7.3…
## 15 Cohort 1 IP/Hosp T1 → T2 Less 12 59 20.3 12/59 (20…
## 16 Cohort 1 IP/Hosp T1 → T2 No change 44 59 74.6 44/59 (74…
## 17 Cohort 1 IP/Hosp T1 → T2 More 3 59 5.1 3/59 (5.1…
# ============================================================
# PRIMARY OUTCOMES: UTILIZATION (ENROLLED ONLY, NO COHORT SPLITS)
# Verifies the slide boxes:
# ED Util Change: Baseline -> T1 (Less / No change / More) with n + %
# IP Util Change: Baseline -> T1 (Less / No change / More) with n + %
# And summary line:
# "T2 enrolled: X% had zero inpatient stays (vs Y% at T1)"
# ============================================================
library(dplyr)
library(stringr)
library(tidyr)
stopifnot(exists("df_status"))
stopifnot("enrolled_group" %in% names(df_status))
# ---- columns (from your setup) ----
col_ed_base <- "ed_vis_last_90_days"
col_ed_t1 <- "ed_utilization_t1"
col_ip_base <- "hosp_last_90_days" # baseline inpatient proxy in your sheet
col_ip_t1 <- "ip_utilization_t1"
col_ip_t2 <- "hospital_utilization_t2" # this is what your slide calls "T2 enrolled: zero inpatient stays"
need <- c(col_ed_base, col_ed_t1, col_ip_base, col_ip_t1, col_ip_t2)
miss <- setdiff(need, names(df_status))
if (length(miss) > 0) stop("Missing columns:\n - ", paste(miss, collapse = "\n - "))
# ---- reuse your parser if it exists; otherwise define once ----
if (!exists("parse_count")) {
parse_count <- function(x) {
s <- str_squish(as.character(x))
s[s == ""] <- NA_character_
s[str_to_upper(s) %in% c("NA","N/A","NULL")] <- NA_character_
suppressWarnings(as.numeric(str_replace_all(s, "[^0-9\\-\\.]", "")))
}
}
# ---- enrolled-only analysis df (does not touch df_status) ----
df_enrolled <- df_status %>% filter(enrolled_group == "Enrolled")
# ============================================================
# A) Change-table helper (Less / No change / More) + exact N
# ============================================================
make_change_boxes <- function(df, before_col, after_col) {
dd <- df %>%
transmute(
before = parse_count(.data[[before_col]]),
after = parse_count(.data[[after_col]])
) %>%
filter(!is.na(before) & !is.na(after)) %>%
mutate(
category = case_when(
after < before ~ "Less",
after > before ~ "More",
TRUE ~ "No change"
)
)
N <- nrow(dd)
out <- dd %>%
count(category, name = "n") %>%
mutate(
N = N,
percent = round(100 * n / N, 1),
label = paste0(percent, "% (n=", n, ")"),
category = factor(category, levels = c("Less","No change","More"))
) %>%
arrange(category)
list(N = N, tab = out)
}
# ---- ED boxes: Baseline -> T1 ----
ed_boxes <- make_change_boxes(df_enrolled, col_ed_base, col_ed_t1)
ed_boxes$N
## [1] 101
ed_boxes$tab
## # A tibble: 3 × 5
## category n N percent label
## <fct> <int> <int> <dbl> <chr>
## 1 Less 58 101 57.4 57.4% (n=58)
## 2 No change 34 101 33.7 33.7% (n=34)
## 3 More 9 101 8.9 8.9% (n=9)
# ---- IP boxes: Baseline -> T1 ----
ip_boxes <- make_change_boxes(df_enrolled, col_ip_base, col_ip_t1)
ip_boxes$N
## [1] 101
ip_boxes$tab
## # A tibble: 3 × 5
## category n N percent label
## <fct> <int> <int> <dbl> <chr>
## 1 Less 62 101 61.4 61.4% (n=62)
## 2 No change 33 101 32.7 32.7% (n=33)
## 3 More 6 101 5.9 5.9% (n=6)
# ============================================================
# B) "T2 enrolled: % had zero inpatient stays (vs % at T1)"
# Here, "inpatient stays at T1" = ip_utilization_t1
# and "inpatient stays at T2" = hospital_utilization_t2
# Denominators are "valid answers" (non-missing numeric parses).
# ============================================================
pct_zero <- function(df, col) {
v <- parse_count(df[[col]])
v <- v[!is.na(v)]
N <- length(v)
n0 <- sum(v == 0)
tibble(N_valid = N, n_zero = n0, percent_zero = round(100 * n0 / N, 1),
label = paste0(round(100 * n0 / N, 1), "% (", n0, "/", N, ")"))
}
zero_t1 <- pct_zero(df_enrolled, col_ip_t1)
zero_t2 <- pct_zero(df_enrolled, col_ip_t2)
zero_t1
## # A tibble: 1 × 4
## N_valid n_zero percent_zero label
## <int> <int> <dbl> <chr>
## 1 101 82 81.2 81.2% (82/101)
zero_t2
## # A tibble: 1 × 4
## N_valid n_zero percent_zero label
## <int> <int> <dbl> <chr>
## 1 59 54 91.5 91.5% (54/59)
# If you want the exact sentence values:
summary_line <- tibble(
t1 = zero_t1$label,
t2 = zero_t2$label
)
summary_line
## # A tibble: 1 × 2
## t1 t2
## <chr> <chr>
## 1 81.2% (82/101) 91.5% (54/59)