df <- read_excel("FIM Data 2-26-2026 (DI).xlsx",
col_types = "text") %>%
clean_names()
names(df)
## [1] "pre_enrollment_notes"
## [2] "cohort"
## [3] "enrolled"
## [4] "age"
## [5] "preferred_language"
## [6] "disch_date_time"
## [7] "cc"
## [8] "admission_diagnosis"
## [9] "admitting_provider"
## [10] "discharge_disposition"
## [11] "lace_readmission_score_score_column"
## [12] "unplanned_readmission_score"
## [13] "primary_cvg"
## [14] "hb_a1c_value_last_3_months"
## [15] "pt_portal_status"
## [16] "hosp_last_365_days"
## [17] "ed_last_365_days"
## [18] "ed_vis_last_90_days"
## [19] "hosp_last_90_days"
## [20] "pcp"
## [21] "race"
## [22] "ethnicity"
## [23] "legal_sex"
## [24] "sdoh_high_risk_domains"
## [25] "zip_code"
## [26] "prescreen_contacted_y_n"
## [27] "ps_in_the_past_12_months_were_you_ever_worried_your_food_would_run_out_before_you_had_money_to_buy_more"
## [28] "ps_in_the_past_12_months_did_the_food_you_could_afford_not_last_until_the_end_of_the_month_and_you_couldn_t_get_more"
## [29] "ps_in_the_last_month_did_anyone_in_your_household_have_to_skip_meals_due_to_lack_of_food"
## [30] "pre_screen_interested_in_program"
## [31] "t0_contact"
## [32] "screen_for_enrollment_interested_in_program"
## [33] "t0_are_you_likely_to_eat_the_food_as_part_of_your_normal_daily_routine"
## [34] "t0_in_the_past_week_how_often_did_you_check_your_blood_sugar"
## [35] "t0_are_you_confident_in_making_food_choices_that_help_control_your_blood_sugar"
## [36] "t0_are_you_confident_in_understanding_how_to_read_food_labels_and_nutrition_information"
## [37] "t0_were_your_sugar_levels_usually_in_your_target_range_last_week"
## [38] "t0_in_the_past_week_did_you_miss_any_doses_of_you_diabetes_medications"
## [39] "t0_since_your_discharge_from_the_hospital_have_you_had_an_appointment_with_a_healthcare_provider_for_your_diabetes"
## [40] "t1_contact_y_n"
## [41] "t1_notes"
## [42] "t1_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"
## [43] "t1_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"
## [44] "if_yes_why_and_how_much_food_was_thrown_away_44"
## [45] "t1_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"
## [46] "t1_in_the_past_week_how_often_did_you_check_your_blood_sugar"
## [47] "t1_were_your_sugar_levels_usually_in_your_target_range_last_week"
## [48] "t1_did_you_have_any_symptoms_of_high_or_low_blood_sugar_in_the_past_week"
## [49] "t1_in_the_past_week_did_you_miss_any_doses_of_you_diabetes_medications"
## [50] "do_you_have_access_to_your_medications_escalate_as_needed_50"
## [51] "t1_since_our_last_check_in_have_you_had_an_appointment_with_a_healthcare_provider_for_your_diabetes"
## [52] "t1_have_you_done_anything_in_the_past_week_to_help_manage_your_diabetes_like_walking_portion_control_or_drinking_more_water"
## [53] "t1_how_much_did_this_program_help_you_feel_more_in_control_of_your_diabetes"
## [54] "t1_did_the_nutrition_material_you_got_from_white_plains_hospital_help_you_make_healthier_food_choices"
## [55] "t1_how_confident_are_you_in_making_food_choices_that_help_control_your_blood_sugar"
## [56] "t1_how_confident_are_you_in_understanding_food_labels_or_nutrition_information"
## [57] "t1_how_confident_are_you_in_your_ability_to_prepare_healthy_meals_with_the_food_you_typically_have_at_home"
## [58] "in_one_to_two_sentences_can_you_tell_me_what_feedback_you_have_that_can_improve_this_program_for_others_58"
## [59] "t1_do_you_have_any_suggestions_or_feedback_to_improve_this_program_for_others"
## [60] "t2_contact_y_n"
## [61] "t2_notes"
## [62] "t2_in_the_past_6_months_were_you_ever_worried_your_food_would_run_out_before_you_had_money_to_buy_more"
## [63] "t2_in_the_past_6_months_did_the_food_you_could_afford_not_last_until_the_end_of_the_month_and_you_couldn_t_get_more"
## [64] "t2_in_the_last_month_did_anyone_in_your_household_have_to_skip_meals_due_to_lack_of_food"
## [65] "t2_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"
## [66] "t2_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"
## [67] "if_yes_why_and_how_much_food_was_thrown_away_67"
## [68] "t2_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"
## [69] "t2_in_the_past_week_how_often_did_you_check_your_blood_sugar"
## [70] "do_you_have_access_to_your_medications_escalate_as_needed_70"
## [71] "t2_were_your_sugar_levels_usually_in_your_target_range_last_week"
## [72] "t2_in_the_past_week_did_you_have_any_symptoms_of_high_or_low_blood_sugar"
## [73] "t2_in_the_past_week_did_you_miss_any_doses_of_your_diabetes_medication"
## [74] "t2_since_our_last_check_in_have_you_had_an_appointment_with_a_healthcare_provider_for_your_diabetes"
## [75] "t2_have_you_done_anything_in_the_past_week_to_help_manage_your_diabetes_like_walking_portion_control_drinking_more_water"
## [76] "t2_how_much_did_this_program_help_you_feel_more_in_control_of_your_diabetes"
## [77] "t2_did_the_nutrition_material_you_got_from_white_plains_hospital_help_you_make_healthier_food_choices"
## [78] "t2_how_confident_are_you_in_making_food_choices_that_help_control_your_blood_sugar"
## [79] "t2_how_confident_are_you_in_understanding_food_labels_or_nutrition_information"
## [80] "t2_how_confident_are_you_in_your_ability_to_prepare_healthy_meals_with_the_foods_you_typically_have_at_home"
## [81] "in_one_to_two_sentences_can_you_tell_me_what_feedback_you_have_that_can_improve_this_program_for_others_81"
## [82] "t2_do_you_have_any_suggestions_or_feedback_to_improve_this_program_for_others"
## [83] "hemoglobin_a1c_8_1_25_10_2_2025"
## [84] "hospital_utilization_8_1_25_10_2_25"
## [85] "a1c_change_8_1_25_10_2_25"
## [86] "a1c_under_8_8_1_25_10_2_25"
## [87] "hemoglobin_a1c_11_20_2_20_t1"
## [88] "ed_utilization_t1"
## [89] "ip_utilization_t1"
## [90] "a1c_change_t1"
## [91] "a1c_under_8_t1"
## [92] "additional_feedback"
## [93] "hemoglobin_a1c_t2"
## [94] "a1c_date_t2"
## [95] "ed_utilization_t2"
## [96] "hospital_utilization_t2"
## [97] "a1c_change_t2"
## [98] "a1c_under_8_t2"
## [99] "housing"
## [100] "housing_2"
## [101] "food"
## [102] "utilities"
## [103] "transport"
## [104] "finance"
## [105] "caregiver"
## [106] "legal"
## [107] "violence"
## [108] "employment"
## [109] "education"
## [110] "completed"
view(df)
# Confirm required columns exist
stopifnot("enrolled" %in% names(df))
stopifnot("cohort" %in% names(df))
df_clean <- df %>%
mutate(
enrolled_clean = str_to_upper(str_squish(as.character(enrolled))),
cohort_clean = str_squish(as.character(cohort)),
cohort_num = suppressWarnings(as.integer(str_extract(cohort_clean, "\\d+")))
)
df_status <- df_clean %>%
mutate(
enrolled_group = case_when(
enrolled_clean == "Y" ~ "Enrolled",
TRUE ~ "Not enrolled"
),
cohort_final = case_when(
enrolled_group == "Enrolled" & cohort_num == 1 ~ "Cohort 1",
enrolled_group == "Enrolled" & cohort_num == 2 ~ "Cohort 2",
enrolled_group == "Enrolled" & cohort_num == 3 ~ "Cohort 3",
TRUE ~ "Not enrolled"
)
)
cohort_counts <- df_status %>%
count(cohort_final, name = "N") %>%
arrange(cohort_final)
cohort_counts
## # A tibble: 4 × 2
## cohort_final N
## <chr> <int>
## 1 Cohort 1 60
## 2 Cohort 2 41
## 3 Cohort 3 35
## 4 Not enrolled 817
a1c_t1_col <- "a1c_change_t1"
ed_t1_col <- "ed_utilization_t1"
ip_t1_col <- "ip_utilization_t1"
a1c_t2_col <- "a1c_change_t2"
ed_t2_col <- "ed_utilization_t2"
hosp_t2_col <- "hospital_utilization_t2"
# ----------------------------
# 7) Safety checks: chart-review columns exist
# ----------------------------
chart_cols <- c(a1c_t1_col, ed_t1_col, ip_t1_col, a1c_t2_col, ed_t2_col, hosp_t2_col)
missing_cols <- setdiff(chart_cols, names(df_status))
if (length(missing_cols) > 0) {
stop(
"These chart-review columns are missing from df_status:\n - ",
paste(missing_cols, collapse = "\n - "),
"\n\nRun: names(df_status) and update the *_col variables to match."
)
}
# ----------------------------
# 8) Cohort subsets
# ----------------------------
df_c1 <- df_status %>% filter(cohort_final == "Cohort 1")
df_c2 <- df_status %>% filter(cohort_final == "Cohort 2")
df_c3 <- df_status %>% filter(cohort_final == "Cohort 3")
# ----------------------------
# 9) Completion helpers
# - "nonblank" means not NA and not "" and not "NA"/"N/A"/"NULL"
# ----------------------------
is_nonblank <- function(x) {
x_chr <- as.character(x)
x_std <- str_to_upper(str_squish(x_chr))
!(is.na(x_chr) | x_std == "" | x_std %in% c("NA", "N/A", "NULL"))
}
pct_complete <- function(df, column) {
total <- nrow(df)
if (total == 0) return("-")
complete_n <- sum(is_nonblank(df[[column]]))
pct <- round(100 * complete_n / total, 1)
paste0(complete_n, "/", total, " (", pct, "%)")
}
pct_complete_all_required <- function(df, cols_required) {
total <- nrow(df)
if (total == 0) return("-")
complete_n <- sum(apply(df[, cols_required, drop = FALSE], 1, function(row) {
all(is_nonblank(row))
}))
pct <- round(100 * complete_n / total, 1)
paste0(complete_n, "/", total, " (", pct, "%)")
}
# ----------------------------
# 10) Basic cohort size table
# ----------------------------
cohort_size_table <- tibble(
Cohort = c("Cohort 1", "Cohort 2", "Cohort 3"),
N = c(nrow(df_c1), nrow(df_c2), nrow(df_c3))
)
cohort_size_table
## # A tibble: 3 × 2
## Cohort N
## <chr> <int>
## 1 Cohort 1 60
## 2 Cohort 2 41
## 3 Cohort 3 35
# ----------------------------
# CHART REVIEW
# ----------------------------
c1_required <- c(a1c_t1_col, ed_t1_col, ip_t1_col, a1c_t2_col, ed_t2_col, hosp_t2_col)
c2_required <- c(a1c_t1_col, ed_t1_col, ip_t1_col)
chart_review_table <- tibble(
Cohort = c("Cohort 1", "Cohort 2", "Cohort 3"),
N = c(nrow(df_c1), nrow(df_c2), nrow(df_c3)),
# Field-level completeness
`A1c Change (T1)` = c(pct_complete(df_c1, a1c_t1_col), pct_complete(df_c2, a1c_t1_col), "-"),
`ED Utilization (T1)` = c(pct_complete(df_c1, ed_t1_col), pct_complete(df_c2, ed_t1_col), "-"),
`IP Utilization (T1)` = c(pct_complete(df_c1, ip_t1_col), pct_complete(df_c2, ip_t1_col), "-"),
`A1c Change (T2)` = c(pct_complete(df_c1, a1c_t2_col), "-", "-"),
`ED Utilization (T2)` = c(pct_complete(df_c1, ed_t2_col), "-", "-"),
`Hospital Utilization (T2)` = c(pct_complete(df_c1, hosp_t2_col), "-", "-"),
# Cohort-level "complete chart review"
`Chart review complete (per cohort rules)` = c(
pct_complete_all_required(df_c1, c1_required),
pct_complete_all_required(df_c2, c2_required),
"-"
)
)
chart_review_table
## # A tibble: 3 × 9
## Cohort N `A1c Change (T1)` `ED Utilization (T1)` `IP Utilization (T1)`
## <chr> <int> <chr> <chr> <chr>
## 1 Cohort 1 60 29/60 (48.3%) 60/60 (100%) 60/60 (100%)
## 2 Cohort 2 41 23/41 (56.1%) 41/41 (100%) 41/41 (100%)
## 3 Cohort 3 35 - - -
## # ℹ 4 more variables: `A1c Change (T2)` <chr>, `ED Utilization (T2)` <chr>,
## # `Hospital Utilization (T2)` <chr>,
## # `Chart review complete (per cohort rules)` <chr>
# -------------------------
# A1c Change (T1): Cohort 1 + Cohort 2
# -------------------------
a1c_t1 <- df_status %>%
filter(cohort_final %in% c("Cohort 1", "Cohort 2")) %>%
transmute(
delta = suppressWarnings(as.numeric(str_replace_all(str_squish(as.character(.data[[a1c_t1_col]])), "[^0-9.\\-]", "")))
) %>%
filter(!is.na(delta)) %>%
mutate(
Category = case_when(
delta <= -1.0 ~ "Improved (≥1.0% ↓)",
delta >= 1.0 ~ "Worsened (≥1.0% ↑)",
TRUE ~ "No meaningful change (<1.0%)"
),
Category = factor(Category, levels = c("Improved (≥1.0% ↓)", "No meaningful change (<1.0%)", "Worsened (≥1.0% ↑)"))
)
n_t1 <- nrow(a1c_t1)
a1c_t1_tab <- a1c_t1 %>%
count(Category, name = "n") %>%
mutate(
pct = round(100 * n / sum(n), 1),
label = paste0(pct, "%\n(n=", n, ")")
)
# ---- small helper: parse numeric safely ----
parse_delta <- function(x) {
suppressWarnings(as.numeric(str_replace_all(str_squish(as.character(x)), "[^0-9.\\-]", "")))
}
# ---- style settings (you asked for color) ----
col_improved <- "#2E7D32" # green
col_nochange <- "#757575" # gray
col_worsened <- "#C62828" # red
make_a1c_plots <- function(df_in, delta_col, title_text) {
d <- df_in %>%
transmute(delta = parse_delta(.data[[delta_col]])) %>%
filter(!is.na(delta)) %>%
mutate(
Category = case_when(
delta <= -1.0 ~ "Improved (≥1.0%)",
delta >= 1.0 ~ "Worsened (≥1.0%)",
TRUE ~ "No meaningful change"
),
Category = factor(Category, levels = c("Improved (≥1.0%)", "No meaningful change", "Worsened (≥1.0%)"))
)
n <- nrow(d)
tab <- d %>%
count(Category, name = "n") %>%
mutate(
pct = round(100 * n / sum(n), 1),
label = paste0(pct, "% (n=", n, ")")
)
# 1) Colored category bar chart
p_bar <- ggplot(tab, aes(x = Category, y = pct, fill = Category)) +
geom_col(width = 0.72) +
geom_text(aes(label = label), vjust = -0.35, size = 4) +
scale_fill_manual(values = c(
"Improved (≥1.0%)" = col_improved,
"No meaningful change" = col_nochange,
"Worsened (≥1.0%)" = col_worsened
)) +
scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
labs(title = paste0(title_text, " (n = ", n, ")"), x = NULL, y = "% of cohort") +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 16),
axis.text.x = element_text(size = 12),
legend.position = "none"
)
# 2) Replace histogram with: jitter + boxplot + threshold bands/lines
p_dist <- ggplot(d, aes(x = "", y = delta)) +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = -1, ymax = 1, alpha = 0.10, fill = col_nochange) +
geom_hline(yintercept = c(-1, 1), linetype = "dashed", linewidth = 0.7, color = "black") +
geom_boxplot(width = 0.22, outlier.shape = NA, alpha = 0.35) +
geom_jitter(width = 0.12, height = 0, alpha = 0.55, size = 1.6) +
coord_flip() +
labs(
title = "A1c change distribution (each dot = 1 patient)",
x = NULL,
y = "A1c change (percentage points)"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 14),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()
)
list(tab = tab, bar = p_bar, dist = p_dist)
}
# =========================
# T1: Cohort 1 + Cohort 2
# =========================
df_a1c_t1 <- df_status %>% filter(cohort_final %in% c("Cohort 1", "Cohort 2"))
out_t1 <- make_a1c_plots(df_a1c_t1, a1c_t1_col, "A1C Change (T1): Cohort 1 + Cohort 2")
out_t1$tab
## # A tibble: 3 × 4
## Category n pct label
## <fct> <int> <dbl> <chr>
## 1 Improved (≥1.0%) 32 62.7 62.7% (n=32)
## 2 No meaningful change 17 33.3 33.3% (n=17)
## 3 Worsened (≥1.0%) 2 3.9 3.9% (n=2)
out_t1$bar

# =========================
# T2: Cohort 1 only
# =========================
df_a1c_t2 <- df_status %>% filter(cohort_final == "Cohort 1")
out_t2 <- make_a1c_plots(df_a1c_t2, a1c_t2_col, "A1C Change (T2): Cohort 1")
out_t2$tab
## # A tibble: 3 × 4
## Category n pct label
## <fct> <int> <dbl> <chr>
## 1 Improved (≥1.0%) 19 63.3 63.3% (n=19)
## 2 No meaningful change 9 30 30% (n=9)
## 3 Worsened (≥1.0%) 2 6.7 6.7% (n=2)
out_t2$bar

# -------------------------
# Column names you specified
# -------------------------
col_a1c_all <- "hb_a1c_value_last_3_months"
col_a1c_t1 <- "hemoglobin_a1c_11_20_2_20_t1"
col_a1c_t2 <- "hemoglobin_a1c_t2"
need_cols <- c(col_a1c_all, col_a1c_t1, col_a1c_t2)
missing <- setdiff(need_cols, names(df_status))
if (length(missing) > 0) {
stop("Missing columns:\n - ", paste(missing, collapse = "\n - "),
"\n\nCheck names(df_status) and fix the col_a1c_* variables above.")
}
# -------------------------
# Clean HbA1c values → numeric
# -------------------------
clean_a1c <- function(x) {
x <- str_squish(as.character(x))
x[x %in% c("", "NA", "N/A", "na", "n/a", "NULL", "null")] <- NA_character_
x[x == "<4.0"] <- "4.0"
x[x == ">14.0"] <- "14.0"
x[x == ">20.0"] <- "20.0"
x <- gsub("%", "", x)
suppressWarnings(as.numeric(x))
}
# -------------------------
# Build long dataset (3 panels)
# -------------------------
a1c_long <- df_status %>%
transmute(
id = row_number(),
A1c_All = clean_a1c(.data[[col_a1c_all]]),
A1c_T1 = clean_a1c(.data[[col_a1c_t1]]),
A1c_T2 = clean_a1c(.data[[col_a1c_t2]])
) %>%
pivot_longer(cols = c(A1c_All, A1c_T1, A1c_T2),
names_to = "Timepoint",
values_to = "HbA1c") %>%
filter(!is.na(HbA1c)) %>%
mutate(
Timepoint = recode(Timepoint,
A1c_All = "All patients (chart review HbA1c)",
A1c_T1 = "T1 HbA1c",
A1c_T2 = "T2 HbA1c"),
Timepoint = factor(Timepoint,
levels = c("All patients (chart review HbA1c)", "T1 HbA1c", "T2 HbA1c")),
Hb_bin = floor(HbA1c),
Hb_mid = Hb_bin + 0.5
)
# -------------------------
# Summarize into 1.0%-wide bins and label percentages
# -------------------------
a1c_hist <- a1c_long %>%
count(Timepoint, Hb_bin, Hb_mid, name = "n") %>%
group_by(Timepoint) %>%
mutate(
pct = round(100 * n / sum(n), 1),
label = ifelse(pct > 0, paste0(pct, "%"), ""),
N = sum(n)
) %>%
ungroup()
# For facet labels with n
facet_labs <- setNames(
paste0(levels(a1c_hist$Timepoint), " (n = ", tapply(a1c_long$HbA1c, a1c_long$Timepoint, length), ")"),
levels(a1c_hist$Timepoint)
)
# Determine x-range
min_bin <- min(a1c_hist$Hb_bin, na.rm = TRUE)
max_bin <- max(a1c_hist$Hb_bin, na.rm = TRUE)
# -------------------------
# Plot (3 stacked panels)
# -------------------------
ggplot(a1c_hist, aes(x = Hb_mid, y = pct)) +
geom_col(width = 0.9, fill = "#4E79A7", color = "white") +
geom_text(aes(label = label), vjust = -0.35, size = 3.4) +
facet_wrap(
~ Timepoint,
ncol = 1,
labeller = labeller(Timepoint = facet_labs)
) +
scale_x_continuous(
breaks = seq(min_bin, max_bin, by = 1),
minor_breaks = NULL
) +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
coord_cartesian(ylim = c(0, max(a1c_hist$pct, na.rm = TRUE) + 10)) +
theme_bw() +
labs(
title = "HbA1c Distributions (All vs T1 vs T2)",
x = "HbA1c (%)",
y = "% within timepoint"
) +
theme(
plot.title = element_text(face = "bold", size = 14),
strip.text = element_text(face = "bold", size = 12),
axis.text = element_text(size = 11)
)

# -------------------------
# 1) COLUMN NAMES (EDIT THESE 5 to match names(df_status))
# -------------------------
col_ed_base <- "ed_vis_last_90_days"
col_ed_t1 <- "ed_utilization_t1"
col_ed_t2 <- "ed_utilization_t2"
col_ip_base <- "hosp_last_90_days" # <-- replace with your true IP baseline column (NOT hosp)
col_ip_t1 <- "ip_utilization_t1"
col_ip_t2 <- "hospital_utilization_t2"
# -------------------------
# 2) Helpers
# -------------------------
parse_count <- function(x) {
x <- as.character(x)
x <- str_squish(x)
x[x == ""] <- NA_character_
x[str_to_upper(x) %in% c("NA","N/A","NULL")] <- NA_character_
suppressWarnings(as.numeric(str_replace_all(x, "[^0-9\\-\\.]", "")))
}
make_bucket <- function(x, cap = 5) {
bx <- ifelse(x >= cap, cap, as.integer(x))
ifelse(bx == cap, paste0(cap, "+"), as.character(bx))
}
# -------------------------
# 3) Safety checks
# -------------------------
need_cols <- c(col_ed_base, col_ed_t1, col_ed_t2,
col_ip_base, col_ip_t1, col_ip_t2,
"enrolled_group")
missing <- setdiff(need_cols, names(df_status))
if (length(missing) > 0) {
stop("Missing columns:\n - ", paste(missing, collapse = "\n - "),
"\n\nFix the col_* names above using names(df_status).")
}
# -------------------------
# 4) Colors (simple + clear)
# -------------------------
col_less <- "#2E7D32" # green
col_same <- "#757575" # gray
col_more <- "#C62828" # red
col_zero <- "#546E7A" # blue-gray
col_onep <- "#8E24AA" # purple
make_dist_df <- function(df_status, base_col, t1_col, t2_col, cap = 5, domain_label = "ED") {
d <- df_status %>%
transmute(
group = if_else(enrolled_group == "Enrolled", "Enrolled", "All patients"),
base = parse_count(.data[[base_col]]),
t1 = parse_count(.data[[t1_col]]),
t2 = parse_count(.data[[t2_col]])
) %>%
pivot_longer(cols = c(base, t1, t2), names_to = "timepoint", values_to = "count") %>%
filter(!is.na(count)) %>%
mutate(
timepoint = recode(timepoint,
base = "Baseline (last 90d)",
t1 = "T1",
t2 = "T2"),
timepoint = factor(timepoint, levels = c("Baseline (last 90d)", "T1", "T2")),
bucket = make_bucket(count, cap = cap),
bucket = factor(bucket, levels = c(as.character(0:(cap-1)), paste0(cap, "+"))),
group = factor(group, levels = c("All patients", "Enrolled")),
domain = domain_label
)
d %>%
count(domain, group, timepoint, bucket, name = "n") %>%
tidyr::complete(domain, group, timepoint, bucket, fill = list(n = 0)) %>% # ensures missing buckets show as 0
group_by(domain, group, timepoint) %>%
mutate(
denom = sum(n),
pct = ifelse(denom > 0, round(100 * n / denom, 1), 0),
label = ifelse(n > 0, paste0(pct, "%"), "")
) %>%
ungroup()
}
plot_dist <- function(tab, title_text, xlab) {
ggplot(tab, aes(x = bucket, y = pct)) +
geom_col(width = 0.8, fill = "#2E6F9E") +
geom_text(aes(label = label), vjust = -0.25, size = 3.6) +
facet_grid(group ~ timepoint) +
scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
theme_bw() +
labs(title = title_text, x = xlab, y = "% of patients within group") +
theme(
plot.title = element_text(face = "bold", size = 14),
strip.background = element_rect(fill = "grey90"),
strip.text = element_text(face = "bold")
)
}
# -------------------------
# 5) ED distribution plot (cap at 5+)
# -------------------------
ed_dist <- make_dist_df(df_status, col_ed_base, col_ed_t1, col_ed_t2, cap = 5, domain_label = "ED")
plot_dist(ed_dist, "ED Utilization Distribution: All patients vs Enrolled", "ED visits")

# -------------------------
# 6) IP distribution plot (cap at 3+)
# -------------------------
ip_dist <- make_dist_df(df_status, col_ip_base, col_ip_t1, col_ip_t2, cap = 3, domain_label = "IP")
plot_dist(ip_dist, "Inpatient Utilization Distribution: All patients vs Enrolled", "Inpatient stays")

# =========================
# 7) CHANGE PLOTS: Baseline → T1 (Enrolled only)
# X-axis: Less visits / No change / More visits
# =========================
# Build enrolled paired cohort (must have baseline + T1 for each domain)
df_util_pairs <- df_status %>%
filter(enrolled_group == "Enrolled") %>%
transmute(
ED_base = parse_count(.data[[col_ed_base]]),
ED_T1 = parse_count(.data[[col_ed_t1]]),
IP_base = parse_count(.data[[col_ip_base]]),
IP_T1 = parse_count(.data[[col_ip_t1]])
) %>%
filter(!is.na(ED_base) & !is.na(ED_T1),
!is.na(IP_base) & !is.na(IP_T1))
n_pairs <- nrow(df_util_pairs)
make_change_tab <- function(df, base_var, t1_var) {
df %>%
transmute(
Change = case_when(
.data[[t1_var]] < .data[[base_var]] ~ "Less visits",
.data[[t1_var]] > .data[[base_var]] ~ "More visits",
TRUE ~ "No change"
)
) %>%
count(Change, name = "n") %>%
mutate(
pct = round(100 * n / sum(n), 1),
label = paste0(pct, "%\n(n=", n, ")"),
Change = factor(Change, levels = c("Less visits", "No change", "More visits"))
)
}
plot_change_bar <- function(tab, title_text) {
ggplot(tab, aes(x = Change, y = pct, fill = Change)) +
geom_col(width = 0.72) +
geom_text(aes(label = label), vjust = -0.35, size = 4) +
scale_fill_manual(values = c(
"Less visits" = col_less,
"No change" = col_same,
"More visits" = col_more
)) +
scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
labs(
title = title_text,
x = NULL,
y = "% of enrolled cohort"
) +
theme_bw() +
theme(
plot.title = element_text(face = "bold", size = 14),
legend.position = "none"
)
}
# ED change
ed_change_tab <- make_change_tab(df_util_pairs, "ED_base", "ED_T1")
p_ed_change <- plot_change_bar(
ed_change_tab,
paste0("ED Utilization Change: Baseline \u2192 T1 (n = ", n_pairs, ")")
)
# IP change
ip_change_tab <- make_change_tab(df_util_pairs, "IP_base", "IP_T1")
p_ip_change <- plot_change_bar(
ip_change_tab,
paste0("IP Utilization Change: Baseline \u2192 T1 (n = ", n_pairs, ")")
)
# show tables if you want
ed_change_tab
## # A tibble: 3 × 4
## Change n pct label
## <fct> <int> <dbl> <chr>
## 1 Less visits 58 57.4 "57.4%\n(n=58)"
## 2 More visits 9 8.9 "8.9%\n(n=9)"
## 3 No change 34 33.7 "33.7%\n(n=34)"
ip_change_tab
## # A tibble: 3 × 4
## Change n pct label
## <fct> <int> <dbl> <chr>
## 1 Less visits 62 61.4 "61.4%\n(n=62)"
## 2 More visits 6 5.9 "5.9%\n(n=6)"
## 3 No change 33 32.7 "32.7%\n(n=33)"
# stacked like your screenshot
p_ed_change / p_ip_change

# =========================
# DEMOGRAPHIC TABLE
# Columns (left->right):
# Category, Value, All Patients, Not enrolled, Enrolled, Cohort 1, Cohort 2, Cohort 3
# Combines multi-select responses that appear in different order
# =========================
# ---- A) Identify (or set) the demographic source columns ----
# If your columns are named differently, change ONLY these two lines.
race_col <- "race"
eth_col <- "ethnicity"
stopifnot(race_col %in% names(df_status))
stopifnot(eth_col %in% names(df_status))
# ---- B) Canonicalize multi-select answers (newline/;/, order-insensitive) ----
canon_multiselect <- function(x) {
x <- as.character(x)
x <- str_trim(x)
x[x == ""] <- NA_character_
sapply(x, function(val) {
if (is.na(val)) return(NA_character_)
parts <- unlist(str_split(val, "\\r?\\n|\\s*[;,]\\s*"))
parts <- str_trim(parts)
parts <- parts[parts != ""]
if (length(parts) == 0) return(NA_character_)
paste(sort(unique(parts)), collapse = "; ")
}, USE.NAMES = FALSE)
}
# ---- C) Map Race codes to labels (keeps multi-select combined) ----
race_map <- c(
"R1" = "American Indian / Alaska Native",
"R2" = "Asian",
"R3" = "Black or African-American",
"R4" = "Native Hawaiian / Pacific Islander",
"R5" = "White",
"R9" = "Other",
"S1" = "Patient Declined",
"S2" = "Patient Unavailable",
"S3" = "Not Applicable / Unknown"
)
clean_race_value <- function(x_vec) {
x_vec <- canon_multiselect(x_vec)
sapply(x_vec, function(s) {
if (is.na(s)) return(NA_character_)
items <- str_split(s, ";\\s*")[[1]]
items <- str_trim(items)
recoded <- vapply(items, function(it) {
code <- str_extract(it, "^[A-Za-z]\\d+")
if (!is.na(code) && code %in% names(race_map)) race_map[[code]] else it
}, character(1))
paste(sort(unique(recoded)), collapse = "; ")
}, USE.NAMES = FALSE)
}
# ---- D) Map Ethnicity codes to labels (keeps multi-select combined if present) ----
clean_eth_value <- function(x_vec) {
x_vec <- canon_multiselect(x_vec)
sapply(x_vec, function(s) {
if (is.na(s)) return(NA_character_)
items <- str_split(s, ";\\s*")[[1]]
items <- str_trim(items)
recoded <- dplyr::case_when(
str_detect(items, "E1") ~ "Spanish / Hispanic / Latino",
str_detect(items, "E2") ~ "Not Spanish / Hispanic / Latino",
str_detect(items, "S1") ~ "Patient Declined",
str_detect(items, "S2") ~ "Patient Unavailable",
TRUE ~ NA_character_
)
recoded <- recoded[!is.na(recoded)]
if (length(recoded) == 0) return(NA_character_)
paste(sort(unique(recoded)), collapse = "; ")
}, USE.NAMES = FALSE)
}
# ---- E) Add cleaned demographic fields to df_status ----
df_status <- df_status %>%
mutate(
race_clean = clean_race_value(.data[[race_col]]),
ethnicity_clean = clean_eth_value(.data[[eth_col]])
)
# ---- F) Formatter: n (percent) with denominator within each column/group ----
fmt_np <- function(n, denom) {
if (is.na(n) || denom == 0) return("-")
paste0(n, " (", sprintf("%.1f", 100 * n / denom), "%)")
}
# ---- G) Generic demo table builder for one variable ----
make_demo_table <- function(data, variable, label) {
d <- data %>%
mutate(Value = canon_multiselect(.data[[variable]])) %>% # canonicalize again (safe)
filter(!is.na(Value))
# denominators (only among rows that answered this variable)
denom_all <- nrow(d)
denom_not <- nrow(filter(d, enrolled_group == "Not enrolled"))
denom_enr <- nrow(filter(d, enrolled_group == "Enrolled"))
denom_c1 <- nrow(filter(d, cohort_final == "Cohort 1"))
denom_c2 <- nrow(filter(d, cohort_final == "Cohort 2"))
denom_c3 <- nrow(filter(d, cohort_final == "Cohort 3"))
# counts
all_ct <- d %>% count(Value, name = "n_all")
not_ct <- d %>% filter(enrolled_group == "Not enrolled") %>% count(Value, name = "n_not")
enr_ct <- d %>% filter(enrolled_group == "Enrolled") %>% count(Value, name = "n_enr")
c1_ct <- d %>% filter(cohort_final == "Cohort 1") %>% count(Value, name = "n_c1")
c2_ct <- d %>% filter(cohort_final == "Cohort 2") %>% count(Value, name = "n_c2")
c3_ct <- d %>% filter(cohort_final == "Cohort 3") %>% count(Value, name = "n_c3")
out <- all_ct %>%
full_join(not_ct, by = "Value") %>%
full_join(enr_ct, by = "Value") %>%
full_join(c1_ct, by = "Value") %>%
full_join(c2_ct, by = "Value") %>%
full_join(c3_ct, by = "Value") %>%
mutate(
Category = label,
`All Patients` = mapply(fmt_np, n_all, denom_all),
`Not enrolled` = mapply(fmt_np, n_not, denom_not),
`Enrolled` = mapply(fmt_np, n_enr, denom_enr),
`Cohort 1` = mapply(fmt_np, n_c1, denom_c1),
`Cohort 2` = mapply(fmt_np, n_c2, denom_c2),
`Cohort 3` = mapply(fmt_np, n_c3, denom_c3)
) %>%
select(Category, Value, `All Patients`, `Not enrolled`, `Enrolled`, `Cohort 1`, `Cohort 2`, `Cohort 3`) %>%
arrange(Category, Value)
out
}
# ---- H) Build tables and combine ----
race_table <- make_demo_table(df_status, "race_clean", "Race")
ethnicity_table <- make_demo_table(df_status, "ethnicity_clean", "Ethnicity")
demographic_table <- bind_rows(race_table, ethnicity_table)
demographic_table
## # A tibble: 23 × 8
## Category Value `All Patients` `Not enrolled` Enrolled `Cohort 1` `Cohort 2`
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Race Americ… 4 (0.4%) 3 (0.4%) 1 (0.7%) 1 (1.7%) -
## 2 Race Asian 29 (3.0%) 27 (3.3%) 2 (1.5%) - -
## 3 Race Asian;… 1 (0.1%) 1 (0.1%) - - -
## 4 Race Asian;… 1 (0.1%) 1 (0.1%) - - -
## 5 Race Black … 234 (24.6%) 180 (22.0%) 54 (39.… 23 (38.3%) 14 (34.1%)
## 6 Race Black … 1 (0.1%) - 1 (0.7%) 1 (1.7%) -
## 7 Race Black … 12 (1.3%) 11 (1.3%) 1 (0.7%) 1 (1.7%) -
## 8 Race Black … 3 (0.3%) 2 (0.2%) 1 (0.7%) - 1 (2.4%)
## 9 Race Black … 2 (0.2%) 2 (0.2%) - - -
## 10 Race Native… 2 (0.2%) 2 (0.2%) - - -
## # ℹ 13 more rows
## # ℹ 1 more variable: `Cohort 3` <chr>
# -------------------------
# 0) COLUMN NAMES (edit if needed)
# -------------------------
race_col <- "race"
sex_col <- "legal_sex"
eth_col <- "ethnicity"
needed_demo_cols <- c("enrolled_group", race_col, sex_col, eth_col)
missing_demo_cols <- setdiff(needed_demo_cols, names(df_status))
if (length(missing_demo_cols) > 0) {
stop(
"Missing demographic columns:\n - ",
paste(missing_demo_cols, collapse = "\n - ")
)
}
# -------------------------
# Helper for legend labels
# -------------------------
make_legend_labels <- function(df_long) {
sizes <- df_long %>%
distinct(id, Group) %>%
count(Group, name = "N")
setNames(
paste0(sizes$Group, " (n = ", sizes$N, ")"),
sizes$Group
)
}
# =========================
# 1) RACE
# =========================
race_map <- c(
"R1" = "American Indian / Alaska Native",
"R2" = "Asian",
"R3" = "Black or African American",
"R4" = "Native Hawaiian / Pacific Islander",
"R5" = "White",
"R9" = "Other"
)
race_long <- df_status %>%
transmute(
id = row_number(),
enrolled_group,
Race_raw = str_squish(as.character(.data[[race_col]]))
) %>%
filter(!is.na(Race_raw), Race_raw != "") %>%
mutate(Race_raw = str_split(Race_raw, "\\r?\\n")) %>%
unnest(Race_raw) %>%
mutate(
Race_code = str_extract(Race_raw, "^R\\d+"),
Race_clean = unname(race_map[Race_code])
) %>%
filter(!is.na(Race_clean))
race_tab <- bind_rows(
race_long %>% mutate(Group = "All patients"),
race_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
) %>%
count(Group, Race_clean, name = "n") %>%
group_by(Group) %>%
mutate(percent = round(100 * n / sum(n), 1)) %>%
ungroup()
race_tab <- race_tab %>%
mutate(Group = factor(Group, levels = c("All patients", "Enrolled")))
race_legend_labels <- make_legend_labels(
bind_rows(
race_long %>% mutate(Group = "All patients"),
race_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
)
)
ggplot(race_tab, aes(x = Race_clean, y = percent, fill = Group)) +
geom_col(position = position_dodge(width = 0.8), width = 0.7) +
geom_text(aes(label = paste0(percent, "%")),
position = position_dodge(width = 0.8),
hjust = -0.1, size = 3.6) +
coord_flip(clip = "off") +
scale_fill_manual(values = c("All patients" = "#CFE1F2",
"Enrolled" = "#1F4E79"),
labels = race_legend_labels) +
expand_limits(y = max(race_tab$percent, na.rm = TRUE) + 8) +
theme_bw() +
labs(title = "Race Distribution: Enrolled vs All Patients",
x = NULL,
y = "% within group",
fill = "") +
theme(plot.title = element_text(face = "bold"),
legend.position = "top")

# =========================
# 2) LEGAL SEX
# =========================
sex_long <- df_status %>%
transmute(
id = row_number(),
enrolled_group,
Sex = str_squish(as.character(.data[[sex_col]]))
) %>%
filter(!is.na(Sex), Sex != "")
sex_tab <- bind_rows(
sex_long %>% mutate(Group = "All patients"),
sex_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
) %>%
count(Group, Sex, name = "n") %>%
group_by(Group) %>%
mutate(percent = round(100 * n / sum(n), 1)) %>%
ungroup() %>%
mutate(Group = factor(Group, levels = c("All patients", "Enrolled")))
sex_legend_labels <- make_legend_labels(
bind_rows(
sex_long %>% mutate(Group = "All patients"),
sex_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
)
)
ggplot(sex_tab, aes(x = Sex, y = percent, fill = Group)) +
geom_col(position = position_dodge(width = 0.8), width = 0.7) +
geom_text(aes(label = paste0(percent, "%")),
position = position_dodge(width = 0.8),
hjust = -0.1, size = 3.6) +
coord_flip(clip = "off") +
scale_fill_manual(values = c("All patients" = "#CFE1F2",
"Enrolled" = "#1F4E79"),
labels = sex_legend_labels) +
expand_limits(y = max(sex_tab$percent, na.rm = TRUE) + 8) +
theme_bw() +
labs(title = "Legal Sex: Enrolled vs All Patients",
x = NULL,
y = "% within group",
fill = "") +
theme(plot.title = element_text(face = "bold"),
legend.position = "top")

# =========================
# 3) ETHNICITY
# =========================
eth_long <- df_status %>%
transmute(
id = row_number(),
enrolled_group,
Eth_raw = str_squish(as.character(.data[[eth_col]])),
Eth_clean = case_when(
str_detect(Eth_raw, "E1") ~ "Spanish/Hispanic/Latino",
str_detect(Eth_raw, "E2") ~ "Not Spanish/Hispanic/Latino",
str_detect(Eth_raw, "S1") ~ "Patient declined",
str_detect(Eth_raw, "S2") ~ "Patient unavailable",
TRUE ~ NA_character_
)
) %>%
filter(!is.na(Eth_clean), Eth_clean != "")
eth_tab <- bind_rows(
eth_long %>% mutate(Group = "All patients"),
eth_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
) %>%
count(Group, Eth_clean, name = "n") %>%
group_by(Group) %>%
mutate(percent = round(100 * n / sum(n), 1)) %>%
ungroup() %>%
mutate(Group = factor(Group, levels = c("All patients", "Enrolled")))
eth_legend_labels <- make_legend_labels(
bind_rows(
eth_long %>% mutate(Group = "All patients"),
eth_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
)
)
ggplot(eth_tab, aes(x = Eth_clean, y = percent, fill = Group)) +
geom_col(position = position_dodge(width = 0.8), width = 0.7) +
geom_text(aes(label = paste0(percent, "%")),
position = position_dodge(width = 0.8),
hjust = -0.1, size = 3.6) +
coord_flip(clip = "off") +
scale_fill_manual(values = c("All patients" = "#CFE1F2",
"Enrolled" = "#1F4E79"),
labels = eth_legend_labels) +
expand_limits(y = max(eth_tab$percent, na.rm = TRUE) + 8) +
theme_bw() +
labs(title = "Ethnicity: Enrolled vs All Patients",
x = NULL,
y = "% within group",
fill = "") +
theme(plot.title = element_text(face = "bold"),
legend.position = "top")

# =========================
# T1 SELF-MANAGEMENT (PAST WEEK)
# =========================
col_t1_manage <- "t1_have_you_done_anything_in_the_past_week_to_help_manage_your_diabetes_like_walking_portion_control_or_drinking_more_water"
# ---- Safety check ----
if (!col_t1_manage %in% names(df_status)) {
stop("Column not found:\n ", col_t1_manage,
"\n\nCheck names(df_status) and update col_t1_manage.")
}
# ---- 1) Clean + categorize responses ----
# ---- 1) Clean + categorize responses (FIXED: blanks/not surveyed -> NA) ----
missing_tokens <- c(
"", "NA", "N/A", "NULL", "NONE",
"NOT SURVEYED", "NOT ASKED", "NOT APPLICABLE",
"SKIPPED", "MISSING", "UNKNOWN"
)
df_manage <- df_status %>%
transmute(
raw = .data[[col_t1_manage]],
raw_chr = as.character(raw),
raw_std = str_to_upper(str_squish(raw_chr)),
category = case_when(
# TRUE missing (actual NA)
is.na(raw_chr) ~ NA_character_,
# empty/whitespace after squish
raw_std == "" ~ NA_character_,
# common placeholders meaning "not surveyed" / missing
raw_std %in% missing_tokens ~ NA_character_,
# if your system uses "-" or "." as blank, include them:
raw_std %in% c("-", ".", "--") ~ NA_character_,
# actual responses
raw_std %in% c("N", "NO") ~ "No",
raw_std %in% c("Y", "YES") ~ "Yes (unspecified)",
str_detect(raw_std, "^DAILY") ~ "Daily",
str_detect(raw_std, "OCCASION") ~ "Occasionally",
TRUE ~ "Other / unclear"
)
)
# ---- 2) Summary table ----
manage_tab <- df_manage %>%
filter(!is.na(category)) %>%
count(category, name = "n") %>%
mutate(
percent = round(100 * n / sum(n), 1)
) %>%
arrange(desc(n))
manage_tab
## # A tibble: 4 × 3
## category n percent
## <chr> <int> <dbl>
## 1 Daily 22 55
## 2 Occasionally 12 30
## 3 Yes (unspecified) 5 12.5
## 4 No 1 2.5
# ---- 3) Bar chart ----
ggplot(manage_tab, aes(x = reorder(category, percent), y = percent)) +
geom_col(width = 0.7, fill = "#4E79A7") +
geom_text(
aes(label = paste0(percent, "% (n=", n, ")")),
hjust = -0.05,
size = 3.6
) +
coord_flip(clip = "off") +
expand_limits(y = max(manage_tab$percent, na.rm = TRUE) + 8) +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
theme_bw() +
labs(
title = "T1: Diabetes Self-Management Actions (Past Week)",
x = "",
y = "% of respondents"
) +
theme(
plot.title = element_text(face = "bold", size = 14),
axis.text = element_text(size = 11)
)

col_help_choices <- "t1_did_the_nutrition_material_you_got_from_white_plains_hospital_help_you_make_healthier_food_choices"
col_conf_food <- "t1_how_confident_are_you_in_making_food_choices_that_help_control_your_blood_sugar"
col_conf_labels <- "t1_how_confident_are_you_in_understanding_food_labels_or_nutrition_information"
col_conf_prepare <- "t1_how_confident_are_you_in_your_ability_to_prepare_healthy_meals_with_the_food_you_typically_have_at_home"
col_help_control <- "t1_how_much_did_this_program_help_you_feel_more_in_control_of_your_diabetes"
# CLEAN + STANDARDIZE LIKERT RESPONSES
# =========================
# =========================
# CLEANERS
# =========================
# A) Confidence-style questions (maps numeric + misspellings + MAYBE)
clean_confidence <- function(x) {
s <- str_to_upper(str_squish(as.character(x)))
case_when(
is.na(s) ~ NA_character_,
s %in% c("", "NA", "N/A", "NULL") ~ NA_character_,
# numeric scales (support both 1-4 and 1-5)
s %in% c("1") ~ "Not confident",
s %in% c("2") ~ "Somewhat not confident",
s %in% c("3") ~ "Confident",
s %in% c("4") ~ "Very confident",
s %in% c("5") ~ "Very confident", # if a stray 5 appears, bucket to Very confident
# explicit text + common typos
str_detect(s, "NOT CONFIDENT") ~ "Not confident",
str_detect(s, "SOMEWHAT NOT CONFIDENT") ~ "Somewhat not confident",
str_detect(s, "\\bMAYBE\\b") ~ "Maybe",
s %in% c("CONFIDENT", "COFIDENT", "CONFIDEMT", "CONFIDNT", "COFIDENT") ~ "Confident",
str_detect(s, "VERY CONFIDENT") ~ "Very confident",
str_detect(s, "SOMEWHAT CONFIDENT") ~ "Somewhat confident",
TRUE ~ "Other / unclear"
)
}
# B) Program-help / impact-style questions (A lot / Somewhat / A little, etc.)
clean_impact <- function(x) {
s <- str_to_upper(str_squish(as.character(x)))
case_when(
is.na(s) ~ NA_character_,
s %in% c("", "NA", "N/A", "NULL") ~ NA_character_,
# numeric (if any)
s %in% c("5") ~ "A lot",
s %in% c("4") ~ "Somewhat",
s %in% c("3") ~ "A little",
s %in% c("2") ~ "A little",
s %in% c("1") ~ "Not at all",
# text variants (case + spacing)
str_detect(s, "^A LOT") ~ "A lot",
str_detect(s, "^SOMEWHAT") ~ "Somewhat",
str_detect(s, "A LITTLE") ~ "A little",
str_detect(s, "NOT AT ALL") ~ "Not at all",
# yes/no (if it sneaks in)
s %in% c("YES", "Y") ~ "Yes",
s %in% c("NO", "N") ~ "No",
TRUE ~ "Other / unclear"
)
}
plot_t1_question <- function(df, col_name, title_text, cleaner_fun) {
if (!col_name %in% names(df)) stop("Column not found: ", col_name)
df_plot <- df %>%
transmute(response = cleaner_fun(.data[[col_name]])) %>%
filter(!is.na(response)) %>%
count(response, name = "n") %>%
mutate(percent = round(100 * n / sum(n), 1)) %>%
arrange(percent)
ggplot(df_plot, aes(x = reorder(response, percent), y = percent)) +
geom_col(fill = "#1F4E79", width = 0.7) +
geom_text(aes(label = paste0(percent, "% (n=", n, ")")),
hjust = -0.05, size = 3.6) +
coord_flip(clip = "off") +
expand_limits(y = max(df_plot$percent, na.rm = TRUE) + 8) +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
theme_bw() +
labs(title = title_text, x = "", y = "% of respondents") +
theme(plot.title = element_text(face = "bold", size = 14),
axis.text = element_text(size = 11))
}
p1 <- plot_t1_question(df_status, col_help_choices,
"T1: Nutrition Material Helped Make Healthier Food Choices",
cleaner_fun = clean_impact)
p2 <- plot_t1_question(df_status, col_conf_food,
"T1: Confidence in Making Food Choices to Control A1C",
cleaner_fun = clean_confidence)
p3 <- plot_t1_question(df_status, col_conf_labels,
"T1: Confidence Understanding Food Labels / Nutrition Info",
cleaner_fun = clean_confidence)
p4 <- plot_t1_question(df_status, col_conf_prepare,
"T1: Confidence Preparing Healthy Meals at Home",
cleaner_fun = clean_confidence)
print(p1); print(p2); print(p3); print(p4)




impact_tab <- df_status %>%
transmute(response = clean_impact(.data[[col_help_control]])) %>%
filter(!is.na(response)) %>%
count(response, name = "n") %>%
mutate(
percent = round(100 * n / sum(n), 1),
label = paste0(response, "\n", percent, "% (n=", n, ")")
)
ggplot(impact_tab, aes(x = "", y = n, fill = response)) +
geom_col(width = 1, color = "white") +
coord_polar(theta = "y") +
geom_text(
aes(label = paste0(percent, "%")),
position = position_stack(vjust = 0.5),
size = 4,
color = "white"
) +
scale_fill_manual(
values = c(
"A lot" = "#1F4E79",
"Somewhat" = "#2E86C1",
"A little" = "#5DADE2",
"Yes" = "#A9CCE3"
)
) +
theme_void() +
labs(
title = "T1: Program Helped Patient Feel More in Control of Diabetes",
fill = ""
) +
theme(
plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
legend.position = "right"
)

# =========================
# MEALS MADE USING PROGRAM FOOD — T1 + T2
# ---- EDIT if needed to match names(df_status) ----
col_meals_t1 <- "t1_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"
col_meals_t2 <- "t2_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"
# ---- safety check ----
need_cols <- c(col_meals_t1, col_meals_t2)
missing <- setdiff(need_cols, names(df_status))
if (length(missing) > 0) {
stop("Missing columns:\n - ", paste(missing, collapse = "\n - "),
"\n\nFix col_meals_t1 / col_meals_t2 using names(df_status).")
}
# ---- robust cleaner (works for BOTH T1 + T2) ----
clean_meals <- function(x) {
s <- str_to_lower(str_squish(as.character(x)))
# convert text numbers to digits
s <- str_replace_all(s, c(
"zero" = "0",
"one" = "1",
"two" = "2",
"three" = "3",
"four" = "4",
"five" = "5",
"six" = "6",
"seven" = "7",
"eight" = "8"
))
case_when(
# missing
is.na(s) ~ NA_character_,
s %in% c("", "na", "n/a", "null", "unsure of #", "unsure") ~ NA_character_,
# ran out = 0
str_detect(s, "ran out") ~ "0",
# explicit 0 / 1
str_detect(s, "^0$") ~ "0",
str_detect(s, "^1$") ~ "1",
# 2–3 bucket
str_detect(s, "2-3") | str_detect(s, "2–3") | str_detect(s, "2 to 3") |
str_detect(s, "^2$") | str_detect(s, "^3$") |
str_detect(s, "2 a day") | str_detect(s, "3 a day") ~ "2–3",
# 4–6 bucket
str_detect(s, "4-6") | str_detect(s, "4 to 6") |
str_detect(s, "4 to 5") |
str_detect(s, "3-4") | str_detect(s, "3 to 4") |
str_detect(s, "^4$") | str_detect(s, "^5$") | str_detect(s, "^6$") |
str_detect(s, "5-6") ~ "4–6",
# 7+
str_detect(s, "7") | str_detect(s, "8") ~ "7+",
TRUE ~ NA_character_
)
}
# ---- plotting helper ----
plot_meals <- function(df, col_name, title_text, y_max = 60) {
tab <- df %>%
transmute(meals = clean_meals(.data[[col_name]])) %>%
filter(!is.na(meals)) %>%
count(meals, name = "n") %>%
mutate(
meals = factor(meals, levels = c("0", "1", "2–3", "4–6", "7+")),
percent = round(100 * n / sum(n), 1),
label = paste0(percent, "% (n=", n, ")")
) %>%
arrange(meals)
ggplot(tab, aes(x = meals, y = percent)) +
geom_col(fill = "#1F4E79", width = 0.7) +
geom_text(aes(label = label), vjust = -0.35, size = 4) +
scale_y_continuous(
limits = c(0, y_max), # <-- FIXED Y LIMIT
labels = function(x) paste0(x, "%")
) +
theme_bw() +
labs(
title = title_text,
x = "Number of meals",
y = "% of respondents"
) +
theme(
plot.title = element_text(face = "bold", size = 14),
axis.text = element_text(size = 11)
)
}
p_meals_t1 <- plot_meals(df_status, col_meals_t1,
"Meals Prepared Using Program Food (Past Week) — T1",
y_max = 60)
p_meals_t2 <- plot_meals(df_status, col_meals_t2,
"Meals Prepared Using Program Food (Past Week) — T2",
y_max = 60)
print(p_meals_t1)

print(p_meals_t2)

# =========================
# EASE OF USING PROGRAM FOOD IN MEALS (T1)
# =========================
# ---- EDIT to match names(df_status) if needed ----
col_ease_t1 <- "t1_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"
col_ease_t2 <- "t2_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"
if (!col_ease_t1 %in% names(df_status)) {
stop("Column not found: ", col_ease_t1,
"\n\nFind the right one with: names(df_status)[str_detect(names(df_status), 'easy')]")
}
# ---- cleaner ----
clean_ease <- function(x) {
s <- str_to_upper(str_squish(as.character(x)))
case_when(
is.na(s) ~ NA_character_,
s %in% c("", "NA", "N/A", "NULL") ~ NA_character_,
# numeric scale variants (adjust if your coding differs)
s %in% c("1") ~ "Very easy",
s %in% c("2") ~ "Easy",
s %in% c("3") ~ "Neutral",
# text variants
str_detect(s, "VERY EASY") ~ "Very easy",
s == "EASY" ~ "Easy",
str_detect(s, "NEUTRAL") ~ "Neutral",
TRUE ~ NA_character_
)
}
ease_summary <- df_status %>%
transmute(response = clean_ease(.data[[col_ease_t1]])) %>%
filter(!is.na(response)) %>%
count(response, name = "n") %>%
mutate(
response = factor(response, levels = c("Neutral", "Easy", "Very easy"), ordered = TRUE),
percent = round(100 * n / sum(n), 1),
label = paste0(percent, "% (n=", n, ")")
) %>%
arrange(response)
ease_summary_t2 <- df_status %>%
transmute(response = clean_ease(.data[[col_ease_t2]])) %>%
filter(!is.na(response)) %>%
count(response, name = "n") %>%
mutate(
response = factor(response,
levels = c("Neutral", "Easy", "Very easy"),
ordered = TRUE),
percent = round(100 * n / sum(n), 1),
label = paste0(percent, "% (n=", n, ")")
) %>%
arrange(response)
ggplot(ease_summary, aes(x = response, y = percent)) +
geom_col(fill = "#2E86C1", width = 0.7) +
geom_text(aes(label = label), vjust = -0.35, size = 4) +
scale_y_continuous(labels = function(x) paste0(x, "%"),
expand = expansion(mult = c(0, 0.12))) +
theme_bw() +
labs(
title = "Ease of Using Program Food in Meals (T1)",
x = "",
y = "% of participants"
) +
theme(
axis.text.x = element_text(size = 11),
plot.title = element_text(size = 14, face = "bold")
)

ggplot(ease_summary_t2, aes(x = response, y = percent)) +
geom_col(fill = "#2E86C1", width = 0.7) +
geom_text(aes(label = label), vjust = -0.35, size = 4) +
scale_y_continuous(
limits = c(0, 60), # adjust if needed to match T1
labels = function(x) paste0(x, "%")
) +
theme_bw() +
labs(
title = "Ease of Using Program Food in Meals (T2)",
x = "",
y = "% of participants"
) +
theme(
axis.text.x = element_text(size = 11),
plot.title = element_text(size = 14, face = "bold")
)

# 1. Identify the T1 waste column explicitly
col_waste_t1 <- "t1_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"
col_waste_t2 <- "t2_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"
# ---- Safety check ----
need_cols <- c(col_waste_t1, col_waste_t2)
missing <- setdiff(need_cols, names(df_status))
if (length(missing) > 0) {
stop("Missing columns:\n - ",
paste(missing, collapse = "\n - "),
"\n\nRun names(df_status) and correct the column names above.")
}
# ---- Clean Yes / No properly (ignore blanks & non-surveyed) ----
clean_yesno <- function(x) {
s <- str_to_upper(str_squish(as.character(x)))
case_when(
is.na(s) ~ NA_character_,
s %in% c("", "NA", "N/A", "NULL") ~ NA_character_,
s %in% c("YES", "Y") ~ "Yes",
s %in% c("NO", "N") ~ "No",
TRUE ~ NA_character_
)
}
# ---- Function to build summary + plot ----
plot_waste <- function(df, col_name, title_text) {
tab <- df %>%
transmute(Response = clean_yesno(.data[[col_name]])) %>%
filter(!is.na(Response)) %>%
count(Response, name = "n") %>%
mutate(
Response = factor(Response, levels = c("No", "Yes")),
percent = round(100 * n / sum(n), 1),
label = paste0(percent, "%\n(n=", n, ")")
)
ggplot(tab, aes(x = Response, y = percent, fill = Response)) +
geom_col(width = 0.6) +
geom_text(aes(label = label), vjust = -0.3, size = 4) +
scale_fill_manual(values = c("No" = "#1a9850", "Yes" = "#d73027")) +
scale_y_continuous(limits = c(0, 100),
labels = function(x) paste0(x, "%")) +
theme_bw() +
labs(
title = title_text,
x = "",
y = "% of respondents"
) +
theme(
legend.position = "none",
plot.title = element_text(size = 14, face = "bold")
)
}
# ---- Draw both graphs ----
p_waste_t1 <- plot_waste(df_status, col_waste_t1,
"Did Patients Throw Away Delivered Food? (T1)")
p_waste_t2 <- plot_waste(df_status, col_waste_t2,
"Did Patients Throw Away Delivered Food? (T2)")
print(p_waste_t1)

print(p_waste_t2)
