library(readxl)
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(tidyr)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(patchwork)
## ---- SETUP_for_PDSA_summary ---------------------------------------------
# 1. Load updated dataset (DF2)
df <- read_excel("DF2.xlsx", col_types = "text")
# 2. Standard NA cleanup for survey-style fields
na_codes <- c("", "NA", "N/A", "na", "n/a", "Na", "NULL", "null")
survey_cols <- grep("^T[01]-", names(df), value = TRUE)
df <- df %>%
mutate(
across(
all_of(survey_cols),
~ {
x <- trimws(as.character(.x))
x[x %in% na_codes] <- NA_character_
x
}
)
)
# 3. Identify T0 and T1 question columns (used everywhere later)
t0_q_cols <- grep("^T0-", names(df), value = TRUE)
t1_q_cols <- grep("^T1-", names(df), value = TRUE)
# 4. Sanity check (optional, safe to keep)
length(t0_q_cols)
## [1] 7
length(t1_q_cols)
## [1] 15
## ---- PDSA_summary_clean_v2 -----------------------------------------------
# ---- Cohort rules (per your update) ----
T0_MIN <- 2 # "more than 1" T0 answer
T1_MIN_C1 <- 3 # "more than 2" T1 answers
TARGET_COHORT1 <- 60
TARGET_COHORT2 <- 60
# ---- Define cohorts ONCE (early) ----
df_status <- df %>%
dplyr::mutate(
T0_answer_count = rowSums(!is.na(dplyr::select(., dplyr::all_of(t0_q_cols)))),
T1_answer_count = rowSums(!is.na(dplyr::select(., dplyr::all_of(t1_q_cols))))
) %>%
dplyr::mutate(
Cohort = dplyr::case_when(
T0_answer_count >= T0_MIN & T1_answer_count >= T1_MIN_C1 ~ "Cohort 1",
T0_answer_count >= T0_MIN ~ "Cohort 2",
TRUE ~ "Not enrolled"
),
Enrolled = if_else(Cohort %in% c("Cohort 1", "Cohort 2"), "Enrolled", "Not enrolled")
)
df_COHORT1 <- df_status %>% dplyr::filter(Cohort == "Cohort 1")
df_COHORT2 <- df_status %>% dplyr::filter(Cohort == "Cohort 2")
df_enrolled <- df_status %>% dplyr::filter(Enrolled == "Enrolled")
# ---- A1c columns in DF2 (confirmed) ----
a1c_base_col <- "HbA1c Value (Last 3 Months)"
a1c_t1_col <- "Hemoglobin A1c (T1)"
a1c_change_col <- "A1c Change (T1)" # use if present; else compute
# ---- Safe numeric cleaner for A1c text ----
clean_a1c <- function(x) {
x <- trimws(as.character(x))
x[x %in% na_codes] <- NA_character_
x[x == "<4.0"] <- "4.0"
x[x == ">14.0"] <- "14.0"
x[x == ">20.0"] <- "20.0"
x <- gsub("%", "", x)
suppressWarnings(as.numeric(x))
}
# ---- Build A1c dataset (no mutate-if_else bug) ----
df_a1c <- df_enrolled %>%
dplyr::mutate(
A1c_baseline = clean_a1c(.data[[a1c_base_col]]),
A1c_T1 = clean_a1c(.data[[a1c_t1_col]])
)
if (a1c_change_col %in% names(df_enrolled)) {
df_a1c <- df_a1c %>% dplyr::mutate(A1c_change = clean_a1c(.data[[a1c_change_col]]))
} else {
df_a1c <- df_a1c %>% dplyr::mutate(A1c_change = A1c_T1 - A1c_baseline)
}
# ---- PDSA thresholds (±1.0%) ----
df_a1c <- df_a1c %>%
dplyr::mutate(
A1c_change_cat = dplyr::case_when(
!is.na(A1c_change) & A1c_change <= -1.0 ~ "Improved (≥1.0% reduction)",
!is.na(A1c_change) & A1c_change >= 1.0 ~ "Worsened (≥1.0% increase)",
!is.na(A1c_change) ~ "No meaningful change",
TRUE ~ NA_character_
)
)
# ---- Counts (actuals) ----
n_total <- nrow(df_status)
n_c1 <- nrow(df_COHORT1)
n_c2 <- nrow(df_COHORT2)
n_enrolled <- nrow(df_enrolled)
# Survey completion (relevant: T0 + T1 only)
n_T0_ge2 <- sum(df_status$T0_answer_count >= T0_MIN)
n_T1_ge1 <- sum(df_status$T1_answer_count >= 1)
# A1c availability among enrolled
n_base_a1c <- sum(!is.na(df_a1c$A1c_baseline))
n_t1_a1c <- sum(!is.na(df_a1c$A1c_T1))
n_change_a1c <- sum(!is.na(df_a1c$A1c_change))
# ---- Change distribution among those WITH A1c change ----
chg_tab <- df_a1c %>%
dplyr::filter(!is.na(A1c_change_cat)) %>%
dplyr::count(A1c_change_cat, name = "Count") %>%
dplyr::mutate(
Denominator = sum(Count),
Percent = round(100 * Count / Denominator, 1)
)
# ---- PDSA-ready summary table (targets + shortfalls + key %s) ----
pdsa_summary <- tibble::tibble(
Metric = c(
"Total people in dataset",
paste0("Cohort 1 (T0≥", T0_MIN, " AND T1≥", T1_MIN_C1, ") — actual"),
"Cohort 1 — target",
"Cohort 1 — shortfall",
paste0("Cohort 2 (T0≥", T0_MIN, " AND T1<", T1_MIN_C1, ") — actual"),
"Cohort 2 — target",
"Cohort 2 — shortfall",
"Enrolled (Cohort 1 + Cohort 2) — actual",
"Among enrolled: baseline HbA1c recorded",
"Among enrolled: T1 HbA1c recorded",
"Among enrolled: A1c Change (T1) available"
),
Count = c(
n_total,
n_c1,
TARGET_COHORT1,
TARGET_COHORT1 - n_c1,
n_c2,
TARGET_COHORT2,
TARGET_COHORT2 - n_c2,
n_enrolled,
n_base_a1c,
n_t1_a1c,
n_change_a1c
),
Denominator = c(
n_total,
TARGET_COHORT1,
TARGET_COHORT1,
TARGET_COHORT1,
TARGET_COHORT2,
TARGET_COHORT2,
TARGET_COHORT2,
TARGET_COHORT1 + TARGET_COHORT2,
n_enrolled,
n_enrolled,
n_enrolled
)
) %>%
dplyr::mutate(Percent = round(100 * Count / Denominator, 1))
pdsa_summary
## # A tibble: 11 × 4
## Metric Count Denominator Percent
## <chr> <dbl> <dbl> <dbl>
## 1 Total people in dataset 489 489 100
## 2 Cohort 1 (T0≥2 AND T1≥3) — actual 38 60 63.3
## 3 Cohort 1 — target 60 60 100
## 4 Cohort 1 — shortfall 22 60 36.7
## 5 Cohort 2 (T0≥2 AND T1<3) — actual 23 60 38.3
## 6 Cohort 2 — target 60 60 100
## 7 Cohort 2 — shortfall 37 60 61.7
## 8 Enrolled (Cohort 1 + Cohort 2) — actual 61 120 50.8
## 9 Among enrolled: baseline HbA1c recorded 61 61 100
## 10 Among enrolled: T1 HbA1c recorded 29 61 47.5
## 11 Among enrolled: A1c Change (T1) available 29 61 47.5
chg_tab
## # A tibble: 3 × 4
## A1c_change_cat Count Denominator Percent
## <chr> <int> <int> <dbl>
## 1 Improved (≥1.0% reduction) 19 29 65.5
## 2 No meaningful change 9 29 31
## 3 Worsened (≥1.0% increase) 1 29 3.4
df_counts <- df %>%
mutate(
T0_answer_count = rowSums(!is.na(select(., all_of(t0_q_cols)))),
T1_answer_count = rowSums(!is.na(select(., all_of(t1_q_cols))))
)
# -----------------------------
# 2. Define cohorts (FINAL)
# -----------------------------
df_counts <- df_counts %>%
mutate(
Cohort = case_when(
T0_answer_count >= 2 & T1_answer_count >= 3 ~ "Cohort 1 (Completed T1)",
T0_answer_count >= 2 & T1_answer_count <= 2 ~ "Cohort 2 (No / partial T1)",
TRUE ~ "Not enrolled"
)
)
# -----------------------------
# 3. Core denominators
# -----------------------------
anticipated_per_cohort <- 60
n_total <- nrow(df_counts)
n_cohort1 <- sum(df_counts$Cohort == "Cohort 1 (Completed T1)")
n_cohort2 <- sum(df_counts$Cohort == "Cohort 2 (No / partial T1)")
n_enrolled <- n_cohort1 + n_cohort2
# -----------------------------
# 4. A1c availability (baseline + T1)
# -----------------------------
a1c_base_col <- "HbA1c Value (Last 3 Months)"
a1c_t1_col <- "Hemoglobin A1c (T1)"
df_enrolled <- df_counts %>%
filter(Cohort %in% c("Cohort 1 (Completed T1)", "Cohort 2 (No / partial T1)"))
n_base_a1c <- sum(!is.na(df_enrolled[[a1c_base_col]]) &
trimws(df_enrolled[[a1c_base_col]]) != "")
df_cohort1 <- df_counts %>%
filter(Cohort == "Cohort 1 (Completed T1)")
n_t1_a1c <- sum(!is.na(df_cohort1[[a1c_t1_col]]) &
trimws(df_cohort1[[a1c_t1_col]]) != "")
# -----------------------------
# 5. Final PDSA-aligned summary table
# -----------------------------
pdsa_summary <- tibble(
Metric = c(
"Anticipated enrollment per cohort",
"Observed Cohort 1 enrollment (completed T1)",
"Observed Cohort 2 enrollment (partial / no T1)",
"Total enrolled (Cohort 1 + Cohort 2)",
"Cohort 1 survey completion rate",
"Enrolled with baseline HbA1c recorded",
"Cohort 1 with T1 HbA1c recorded"
),
Count = c(
anticipated_per_cohort,
n_cohort1,
n_cohort2,
n_enrolled,
n_cohort1,
n_base_a1c,
n_t1_a1c
),
Denominator = c(
anticipated_per_cohort,
anticipated_per_cohort,
anticipated_per_cohort,
anticipated_per_cohort * 2,
anticipated_per_cohort,
n_enrolled,
n_cohort1
),
Percent = round(100 * Count / Denominator, 1)
)
pdsa_summary
## # A tibble: 7 × 4
## Metric Count Denominator Percent
## <chr> <dbl> <dbl> <dbl>
## 1 Anticipated enrollment per cohort 60 60 100
## 2 Observed Cohort 1 enrollment (completed T1) 38 60 63.3
## 3 Observed Cohort 2 enrollment (partial / no T1) 23 60 38.3
## 4 Total enrolled (Cohort 1 + Cohort 2) 61 120 50.8
## 5 Cohort 1 survey completion rate 38 60 63.3
## 6 Enrolled with baseline HbA1c recorded 61 61 100
## 7 Cohort 1 with T1 HbA1c recorded 38 38 100
#RACE ANALYSIS
race_long <- df_status %>%
select(Enrolled, Race) %>%
filter(!is.na(Race), Race != "") %>%
mutate(Race = str_split(Race, "\\r\\n")) %>%
unnest(Race) %>%
mutate(Race = str_trim(Race)) %>%
filter(str_detect(Race, "^R")) # keeps only race codes
race_long <- race_long %>%
mutate(
Race_clean = case_when(
str_detect(Race, "R1") ~ "American Indian / Alaska Native",
str_detect(Race, "R2") ~ "Asian",
str_detect(Race, "R3") ~ "Black or African American",
str_detect(Race, "R4") ~ "Native Hawaiian / Pacific Islander",
str_detect(Race, "R5") ~ "White",
str_detect(Race, "R9") ~ "Other",
TRUE ~ NA_character_
)
) %>%
filter(!is.na(Race_clean))
race_enroll_tab <- race_long %>%
count(Enrolled, Race_clean) %>%
group_by(Enrolled) %>%
mutate(percent = round(100 * n / sum(n), 1)) %>%
ungroup()
race_enroll_tab
## # A tibble: 10 × 4
## Enrolled Race_clean n percent
## <chr> <chr> <int> <dbl>
## 1 Enrolled American Indian / Alaska Native 1 1.6
## 2 Enrolled Black or African American 25 41
## 3 Enrolled Other 20 32.8
## 4 Enrolled White 15 24.6
## 5 Not enrolled American Indian / Alaska Native 2 0.5
## 6 Not enrolled Asian 9 2.2
## 7 Not enrolled Black or African American 105 25.3
## 8 Not enrolled Native Hawaiian / Pacific Islander 1 0.2
## 9 Not enrolled Other 101 24.3
## 10 Not enrolled White 197 47.5
race_order <- race_enroll_tab %>%
group_by(Race_clean) %>%
summarize(max_pct = max(percent)) %>%
arrange(max_pct) %>%
pull(Race_clean)
race_enroll_tab <- race_enroll_tab %>%
mutate(
Race_clean = factor(Race_clean, levels = race_order),
Enrolled = factor(Enrolled, levels = c("Not enrolled", "Enrolled"))
)
ggplot(race_enroll_tab,
aes(x = Race_clean, y = percent, fill = Enrolled)) +
geom_col(position = position_dodge(width = 0.8)) +
geom_text(
aes(label = paste0(round(percent, 1), "%")),
position = position_dodge(width = 0.8),
hjust = -0.1, # push text just outside bars
size = 3.6,
color = "black"
) +
coord_flip(clip = "off") + # allow labels past plot area
scale_x_discrete(
limits = rev(levels(race_enroll_tab$Race_clean))
) +
scale_fill_manual(
values = c(
"Not enrolled" = "#CFE1F2", # slightly lighter for contrast
"Enrolled" = "#1F4E79"
)
) +
expand_limits(y = max(race_enroll_tab$percent) + 8) +
theme_bw() +
labs(
title = "Race Distribution by Enrollment Status",
x = "Race",
y = "% within enrollment group",
fill = "Enrollment Status"
) +
theme(
plot.title = element_text(face = "bold", size = 14),
axis.text.y = element_text(size = 10),
axis.text.x = element_text(size = 10),
legend.position = "top"
)

###SEX ANALYSIS
sex_tab <- df_status %>% # <- change df to df_status
filter(!is.na(`Legal Sex`)) %>%
group_by(Enrolled, `Legal Sex`) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(Enrolled) %>%
mutate(percent = 100 * n / sum(n))
ggplot(sex_tab,
aes(x = `Legal Sex`, y = percent, fill = Enrolled)) +
geom_col(position = position_dodge(width = 0.8)) +
geom_text(aes(label = paste0(round(percent,1), "%")),
position = position_dodge(width = 0.8),
vjust = -0.3,
size = 3.5) +
scale_fill_manual(values = c(
"Not enrolled" = "#CFE1F2",
"Enrolled" = "#1F4E79"
)) +
theme_bw() +
labs(
title = "Legal Sex by Enrollment Status",
x = "Legal Sex",
y = "% within enrollment group",
fill = "Enrollment Status"
)

###ETHNICITY
## 1. Clean ethnicity into simple categories --------------------------
df_status <- df_status %>%
mutate(
Ethnicity_clean = dplyr::case_when(
grepl("E1", Ethnicity) ~ "Spanish/Hispanic/Latino",
grepl("E2", Ethnicity) ~ "Not Spanish/Hispanic/Latino",
grepl("S1", Ethnicity) ~ "Patient declined",
grepl("S2", Ethnicity) ~ "Patient unavailable",
TRUE ~ NA_character_
)
)
## 2. Summarize % within each enrollment group ------------------------
eth_summary <- df_status %>%
dplyr::filter(!is.na(Ethnicity_clean)) %>%
dplyr::count(Enrolled, Ethnicity_clean) %>%
dplyr::group_by(Enrolled) %>%
dplyr::mutate(pct = 100 * n / sum(n)) %>%
dplyr::ungroup()
## 3. Order ethnicity by overall prevalence (optional, just for nicer plot)
eth_order <- eth_summary %>%
dplyr::group_by(Ethnicity_clean) %>%
dplyr::summarise(overall_n = sum(n), .groups = "drop") %>%
dplyr::arrange(dplyr::desc(overall_n)) %>%
dplyr::pull(Ethnicity_clean)
eth_summary$Ethnicity_clean <- factor(eth_summary$Ethnicity_clean,
levels = eth_order)
## 4. Plot: Ethnicity by enrollment status ----------------------------
ggplot(eth_summary,
aes(x = Ethnicity_clean, y = pct, fill = Enrolled)) +
geom_col(position = position_dodge(width = 0.8), width = 0.7) +
geom_text(aes(label = sprintf("%.1f%%", pct)),
position = position_dodge(width = 0.8),
vjust = -0.3, size = 3) +
scale_fill_manual(values = c("Not enrolled" = "#a6cee3",
"Enrolled" = "#1f78b4")) +
labs(
title = "Ethnicity by Enrollment Status",
x = "Ethnicity",
y = "% within enrollment group",
fill = "Enrollment Status"
) +
theme_bw() +
theme(
axis.text.x = element_text(angle = 30, hjust = 1, size = 9),
plot.title = element_text(size = 14, face = "bold")
)

## ---- hba1c_hist_by_enrollment (DF2) ---------------------------------
# 0) Ensure df_status + Enrolled exist (uses your rule: >=3 T0 answers)
t0_q_cols <- grep("^T0-", names(df), value = TRUE)
t0_q_cols <- t0_q_cols[!grepl("Contact|NOTES", t0_q_cols, ignore.case = TRUE)]
df_status <- df %>%
mutate(
T0_answer_count = rowSums(!is.na(dplyr::select(., all_of(t0_q_cols)))),
Enrolled = if_else(T0_answer_count >= 3, "Enrolled", "Not enrolled")
)
# 1) Clean HbA1c (baseline column in DF2)
clean_a1c <- function(x) {
x <- trimws(as.character(x))
x[x %in% c("", "NA", "N/A", "na", "n/a", "Na", "NULL", "null")] <- NA_character_
x[x == "<4.0"] <- "4.0"
x[x == ">14.0"] <- "14.0"
x[x == ">20.0"] <- "20.0"
x <- gsub("%", "", x)
as.numeric(x)
}
df_status <- df_status %>%
mutate(HbA1c = clean_a1c(`HbA1c Value (Last 3 Months)`))
# 2) Pre-bin + percent within enrollment group (stable + easy to label)
df_hist <- df_status %>%
filter(!is.na(HbA1c), !is.na(Enrolled)) %>%
mutate(
Hb_bin = floor(HbA1c),
Hb_mid = Hb_bin + 0.5
) %>%
count(Enrolled, Hb_bin, Hb_mid) %>%
group_by(Enrolled) %>%
mutate(pct = n / sum(n) * 100) %>%
ungroup() %>%
mutate(
Enrolled = factor(Enrolled, levels = c("Enrolled", "Not enrolled")),
label = ifelse(pct > 0, paste0(round(pct, 1), "%"), "")
)
# 3) Plot (two stacked histograms, y-limit like your example)
ggplot(df_hist, aes(x = Hb_mid, y = pct)) +
geom_col(width = 0.9, fill = "#4E79A7", color = "white") +
geom_text(aes(label = label), vjust = -0.35, size = 3.5) +
facet_wrap(~ Enrolled, ncol = 1, scales = "fixed") +
scale_x_continuous(
breaks = seq(min(df_hist$Hb_bin, na.rm = TRUE), max(df_hist$Hb_bin, na.rm = TRUE), by = 1),
minor_breaks = NULL
) +
coord_cartesian(ylim = c(0, 50)) +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
theme_bw() +
labs(
title = "HbA1c Distribution by Enrollment Status",
x = "HbA1c (%)",
y = "% of patients within group"
) +
theme(
plot.title = element_text(size = 14, face = "bold"),
strip.text = element_text(size = 12, face = "bold"),
axis.text = element_text(size = 11)
)

### Blood sugar check frequency (T0 vs T1)
## 1. Identify the two survey columns
t0_col <- grep(
"^T0- In the past week, how often did you check your blood sugar",
names(df_COHORT1),
value = TRUE
)
t1_col <- grep(
"^T1- In the past week, how often did you check your blood sugar",
names(df_COHORT1),
value = TRUE
)
stopifnot(length(t0_col) == 1, length(t1_col) == 1)
t0_raw <- df_COHORT1[[t0_col]]
t1_raw <- df_COHORT1[[t1_col]]
## 2. Cleaning function: collapse equivalent strings
clean_freq <- function(x) {
x_std <- toupper(trimws(as.character(x)))
dplyr::case_when(
# 1–2 times (all variants)
x_std %in% c("1–2 TIMES", "1-2 TIMES", "1 TO 2 TIMES") ~ "1–2 times",
# 3–4 times
x_std %in% c("3–4 TIMES", "3-4 TIMES") ~ "3–4 times",
# Daily
x_std %in% c("DAILY", "DAILY ", "DAILY.") ~ "Daily",
# More than once daily
x_std %in% c("MORE THAN DAILY",
"MORE THAN ONCE DAILY",
"MORE THAN ONCE DAILY",
"MORE THAN ONCE A DAY") ~ "More than once daily",
# Not at all
x_std %in% c("NOT AT ALL") ~ "Not at all",
# Anything unexpected
TRUE ~ NA_character_
)
}
## 3. Long data with cleaned responses
df_bs <- tibble::tibble(
Timepoint = c(rep("T0", length(t0_raw)), rep("T1", length(t1_raw))),
Response_raw = c(t0_raw, t1_raw)
) %>%
mutate(Response_clean = clean_freq(Response_raw)) %>%
filter(!is.na(Response_clean))
## 4. Summary table for plotting
freq_summary <- df_bs %>%
count(Timepoint, Response_clean) %>%
group_by(Timepoint) %>%
mutate(pct = round(100 * n / sum(n), 1)) %>%
ungroup() %>%
mutate(
Response_clean = factor(
Response_clean,
levels = c("Not at all",
"1–2 times",
"3–4 times",
"Daily",
"More than once daily")
)
)
## 5. Plot: frequency of blood sugar checks at T0 vs T1
ggplot(freq_summary,
aes(x = Response_clean, y = pct, fill = Timepoint)) +
geom_col(position = position_dodge(width = 0.8), width = 0.7) +
geom_text(aes(label = paste0(pct, "%")),
position = position_dodge(width = 0.8),
vjust = -0.3,
size = 3.5) +
theme_bw() +
labs(
title = "How Often Patients Checked Blood Sugar (T0 vs T1)",
x = "Frequency in past week",
y = "% of participants",
fill = "Timepoint"
) +
theme(
axis.text.x = element_text(angle = 30, hjust = 1, size = 10),
plot.title = element_text(size = 14, face = "bold")
)

###blood sugar in range
sugar_levels_T0 <- c("No", "Maybe", "Yes")
sugar_levels_T1 <- c(
"Never",
"Sometimes",
"About half the time",
"Most of the time",
"Always"
)
sugar_T0_raw <- df_COHORT1$`T0- Were your sugar levels usually in your target range last week?`
sugar_T1_raw <- df_COHORT1$`T1- Were your sugar levels usually in your target range last week?`
length(sugar_T0_raw)
## [1] 38
length(sugar_T1_raw)
## [1] 38
clean_T0 <- function(x) {
x_std <- toupper(trimws(x))
case_when(
x_std == "YES" ~ "Yes",
x_std == "NO" ~ "No",
x_std == "MAYBE" ~ "Maybe",
TRUE ~ NA_character_
)
}
clean_T1 <- function(x) {
x_std <- toupper(trimws(x))
case_when(
x_std %in% c("NEVER") ~ "Never",
x_std == "SOMETIMES" ~ "Sometimes",
x_std %in% c("ABOUT HALF THE TIME", "HALF THE TIME") ~ "About half",
x_std %in% c("MOST OF THE TIME", "YES") ~ "Most",
x_std == "ALWAYS" ~ "Always",
TRUE ~ NA_character_
)
}
T0_clean <- clean_T0(sugar_T0_raw)
T1_clean <- clean_T1(sugar_T1_raw)
score_map <- c(
"No" = 0,
"Never" = 0,
"Maybe" = 1,
"Sometimes" = 1,
"About half" = 2,
"Yes" = 3,
"Most" = 3,
"Always" = 4
)
T0_score <- unname(score_map[T0_clean])
T1_score <- unname(score_map[T1_clean])
df_change <- data.frame(
patient_id = seq_len(nrow(df_COHORT1)),
T0_raw = T0_clean,
T1_raw = T1_clean,
T0_score = T0_score,
T1_score = T1_score
) %>%
filter(!is.na(T0_score) & !is.na(T1_score)) %>%
mutate(
delta = T1_score - T0_score,
change = case_when(
delta > 0 ~ "Increased",
delta < 0 ~ "Decreased",
TRUE ~ "No change"
)
)
change_summary <- df_change %>%
count(change) %>%
mutate(percent = round(100 * n / sum(n), 1))
change_summary$change <- factor(
change_summary$change,
levels = c("Decreased", "No change", "Increased")
)
ggplot(change_summary, aes(x = change, y = percent, fill = change)) +
geom_col() +
geom_text(
aes(label = paste0(percent, "%")),
vjust = -0.3,
size = 4
) +
scale_fill_manual(
values = c(
"Decreased" = "#d73027",
"No change" = "#4E79A7",
"Increased" = "#1a9850"
)
) +
theme_bw() +
labs(
title = "Change in Blood Sugar in Target Range (T0 → T1)",
x = "",
y = "% of participants"
) +
theme(
legend.position = "none",
plot.title = element_text(size = 14, face = "bold")
)

change_grouped_table <- df_change %>%
count(change, T0_raw, T1_raw) %>%
arrange(change, desc(n))
change_grouped_percent <- df_change %>%
count(change, T0_raw, T1_raw) %>%
group_by(change) %>%
mutate(percent = round(100 * n / sum(n), 1)) %>%
ungroup() %>%
arrange(change, desc(percent))
change_grouped_wide <- change_grouped_percent %>%
mutate(label = paste0(n, " (", percent, "%)")) %>%
select(change, T0_raw, T1_raw, label) %>%
tidyr::pivot_wider(
names_from = T1_raw,
values_from = label,
values_fill = ""
)
change_grouped_wide
## # A tibble: 8 × 7
## change T0_raw Sometimes Never Most `About half` Always
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Decreased Yes "2 (66.7%)" "" "" "" ""
## 2 Decreased Maybe "" "1 (33.3%)" "" "" ""
## 3 Increased No "6 (33.3%)" "" "2 (11.1%)" "" "1 (5.6%)"
## 4 Increased Maybe "" "" "5 (27.8%)" "2 (11.1%)" "1 (5.6%)"
## 5 Increased Yes "" "" "" "" "1 (5.6%)"
## 6 No change Yes "" "" "7 (63.6%)" "" ""
## 7 No change Maybe "3 (27.3%)" "" "" "" ""
## 8 No change No "" "1 (9.1%)" "" "" ""
###Confidence in blood sugar maintenance practices
confidence_levels <- c(
"Not confident",
"Somewhat not confident",
"Maybe",
"Confident",
"Very confident"
)
# T0: Yes / Maybe / No → 1–3
conf_T0_raw <- df_COHORT1$`T0- Are you Confident in making food choices that help control your blood sugar?`
conf_T0_num <- dplyr::case_when(
toupper(trimws(conf_T0_raw)) == "NO" ~ 1,
toupper(trimws(conf_T0_raw)) == "MAYBE" ~ 2,
toupper(trimws(conf_T0_raw)) == "YES" ~ 3,
TRUE ~ NA_real_
)
# T1: 1–4 scale or text variants → 1–4
conf_T1_raw <- df_COHORT1$`T1- How Confident are you in making food choices that help control your blood sugar?`
conf_T1_num <- {
x <- toupper(trimws(as.character(conf_T1_raw)))
dplyr::case_when(
x %in% c("1", "NOT CONFIDENT") ~ 1,
x %in% c("2", "SOMEWHAT NOT CONFIDENT") ~ 2,
x %in% c("3", "CONFIDENT", "COFIDENT", "CONFIDEMT") ~ 3,
x %in% c("4", "VERY CONFIDENT") ~ 4,
TRUE ~ NA_real_
)
}
# Combine & classify change
df_conf_change <- data.frame(
conf_T0_num = conf_T0_num,
conf_T1_num = conf_T1_num
) %>%
dplyr::filter(!is.na(conf_T0_num), !is.na(conf_T1_num)) %>%
dplyr::mutate(
change = conf_T1_num - conf_T0_num,
ChangeCategory = dplyr::case_when(
change > 0 ~ "Increased confidence",
change == 0 ~ "No change",
change < 0 ~ "Decreased confidence"
)
)
change_summary <- df_conf_change %>%
dplyr::count(ChangeCategory) %>%
dplyr::mutate(
percent = round(100 * n / sum(n), 1),
label = paste0(n, " (", percent, "%)")
)
ggplot(change_summary, aes(x = ChangeCategory, y = n)) +
geom_col(fill = "#2E86C1", width = 0.6) +
geom_text(aes(label = label),
vjust = -0.3, size = 3.8) +
scale_y_continuous(
expand = expansion(mult = c(0, 0.1))
) +
theme_bw() +
labs(
title = "Change in Confidence Making Food Choices (T0 → T1)",
x = "",
y = "Number of participants"
) +
theme(
axis.text.x = element_text(size = 11),
axis.title.y = element_text(size = 11),
plot.title = element_text(size = 14, face = "bold")
)

# ---- T0 confidence (Yes / Maybe / No) ----
conf_T0_raw <- df_COHORT1$`T0- Are you Confident in making food choices that help control your blood sugar?`
conf_T0_label <- dplyr::case_when(
toupper(trimws(conf_T0_raw)) == "NO" ~ "No",
toupper(trimws(conf_T0_raw)) == "MAYBE" ~ "Maybe",
toupper(trimws(conf_T0_raw)) == "YES" ~ "Yes",
TRUE ~ NA_character_
)
# ---- T1 confidence (1–4 or text variants) ----
conf_T1_raw <- df_COHORT1$`T1- How Confident are you in making food choices that help control your blood sugar?`
conf_T1_label <- {
x <- toupper(trimws(as.character(conf_T1_raw)))
dplyr::case_when(
x %in% c("1", "NOT CONFIDENT") ~ "Not confident",
x %in% c("2", "SOMEWHAT NOT CONFIDENT") ~ "Somewhat not confident",
x %in% c("3", "CONFIDENT", "COFIDENT", "CONFIDEMT") ~ "Confident",
x %in% c("4", "VERY CONFIDENT") ~ "Very confident",
TRUE ~ NA_character_
)
}
conf_transition <- data.frame(
T0 = conf_T0_label,
T1 = conf_T1_label,
stringsAsFactors = FALSE
) %>%
dplyr::filter(!is.na(T0), !is.na(T1))
conf_transition_table <- conf_transition %>%
dplyr::count(T0, T1, name = "n") %>%
dplyr::arrange(desc(n))
conf_transition_table
## T0 T1 n
## 1 Yes Very confident 19
## 2 Yes Confident 5
## 3 Maybe Confident 4
## 4 No Very confident 2
## 5 Yes Somewhat not confident 2
## 6 Maybe Somewhat not confident 1
## 7 Maybe Very confident 1
## 8 No Confident 1
## 9 No Somewhat not confident 1
## 10 Yes Not confident 1
## Appointments T0 vs T1 – enrolled pts (>= 3 T0 answers, >=1 T1 answer)
# 1. Identify T0 / T1 question columns ------------------------------
t0_q_cols <- grep("^T0-", names(df), value = TRUE)
t1_q_cols <- grep("^T1-", names(df), value = TRUE)
# 2. Define analysis cohort locally (DOES NOT overwrite df_status) --
df_appt_base <- df %>%
mutate(
T0_answer_count = rowSums(!is.na(dplyr::select(., all_of(t0_q_cols)))),
T1_answer_count = rowSums(!is.na(dplyr::select(., all_of(t1_q_cols))))
) %>%
filter(
T0_answer_count >= 3, # your enrollment rule
T1_answer_count > 0 # must have answered something at T1
)
# 3. Pull raw appointment columns from this filtered cohort ---------
appt_T0_raw <- df_appt_base$`T0- Since your discharge from the hospital, have you had an appointment with a healthcare provider for your diabetes?`
appt_T1_raw <- df_appt_base$`T1- Since our last check in, have you had an appointment with a healthcare provider for your diabetes?`
# 4. Clean Yes/No responses -----------------------------------------
clean_yesno <- function(x) {
x_std <- toupper(trimws(as.character(x)))
dplyr::case_when(
x_std %in% c("YES", "Y", "YS") ~ "Yes",
x_std %in% c("NO", "N") ~ "No",
TRUE ~ NA_character_
)
}
appt_T0_clean <- clean_yesno(appt_T0_raw)
appt_T1_clean <- clean_yesno(appt_T1_raw)
# 5. Build paired change dataset (complete appointment data only) ---
df_appt_change <- tibble(
patient_id = seq_len(nrow(df_appt_base)),
T0_raw = appt_T0_clean,
T1_raw = appt_T1_clean
) %>%
filter(!is.na(T0_raw), !is.na(T1_raw)) # drops any remaining NA pairs
# 6. Score Yes/No and classify change -------------------------------
score_map <- c("No" = 0, "Yes" = 1)
df_appt_change <- df_appt_change %>%
mutate(
T0_score = unname(score_map[T0_raw]),
T1_score = unname(score_map[T1_raw]),
delta = T1_score - T0_score,
change = dplyr::case_when(
delta > 0 ~ "Increased", # No -> Yes
delta < 0 ~ "Decreased", # Yes -> No
TRUE ~ "No change" # Yes->Yes or No->No
)
)
# 7. Summary for bar chart ------------------------------------------
change_summary <- df_appt_change %>%
count(change) %>%
mutate(
percent = round(100 * n / sum(n), 1),
change = factor(change, levels = c("Decreased", "No change", "Increased"))
)
# 8. (Optional) transition table object (not printed automatically) -
transition_table_wide <- df_appt_change %>%
count(change, T0_raw, T1_raw) %>%
group_by(change) %>%
mutate(
percent = round(100 * n / sum(n), 1),
label = paste0(n, " (", percent, "%)")
) %>%
ungroup() %>%
select(change, T0_raw, T1_raw, label) %>%
pivot_wider(
names_from = T1_raw,
values_from = label,
values_fill = ""
)
# View(transition_table_wide) if you want to inspect it
# 9. Pie-chart data (same cohort as change plot) --------------------
appt_pie_df <- df_appt_change %>%
select(T0_raw, T1_raw) %>%
mutate(row = row_number()) %>%
pivot_longer(cols = c(T0_raw, T1_raw),
names_to = "Timepoint",
values_to = "Response") %>%
mutate(
Timepoint = if_else(Timepoint == "T0_raw", "T0", "T1"),
Response = factor(Response, levels = c("Yes", "No"))
)
appt_pie_summary <- appt_pie_df %>%
count(Timepoint, Response) %>%
group_by(Timepoint) %>%
mutate(
percent = round(100 * n / sum(n), 1),
label = paste0(percent, "%\n(n=", n, ")")
) %>%
ungroup()
# 10. Plots: bar of change + two pies -------------------------------
p_change <- ggplot(change_summary,
aes(x = change, y = percent, fill = change)) +
geom_col() +
geom_text(
aes(label = paste0(percent, "%")),
vjust = -0.3,
size = 4
) +
scale_fill_manual(
values = c(
"Decreased" = "#d73027",
"No change" = "#4E79A7",
"Increased" = "#1a9850"
)
) +
scale_y_continuous(
limits = c(0, 80),
breaks = seq(0, 80, by = 10),
labels = function(x) paste0(x, "%")
) +
theme_bw() +
labs(
title = "Change in Having a Diabetes-Related Appointment (T0 → T1)",
x = "",
y = "% of participants"
) +
theme(
legend.position = "none",
plot.title = element_text(size = 14, face = "bold")
)
p_pies <- ggplot(appt_pie_summary,
aes(x = "", y = percent, fill = Response)) +
geom_col(width = 1, color = "white") +
coord_polar(theta = "y") +
geom_text(
aes(label = label),
position = position_stack(vjust = 0.5),
size = 3.8
) +
facet_wrap(~ Timepoint) +
scale_fill_manual(values = c("Yes" = "#1a9850", "No" = "#d73027")) +
theme_void() +
labs(
title = "Appointments with Diabetes Provider at T0 and T1",
fill = "Response"
) +
theme(
plot.title = element_text(size = 14, face = "bold"),
strip.text = element_text(size = 12, face = "bold")
)
p_change / p_pies # <- this is the only thing that prints

## ---- impact_meals_made ---------------------------------------------
meals_raw <- df_COHORT1$`T1- In the past week, how many meals did you make using food from the program?`
clean_meals <- function(x) {
x_std <- toupper(trimws(as.character(x)))
dplyr::case_when(
x_std %in% c("0") ~ "0",
x_std %in% c("1","1 MEAL") ~ "1",
x_std %in% c("2 TO 3","2-3","2–3","2 TO3","2 TO 3") ~ "2–3",
x_std %in% c("4 TO 6","4-6","4–6","4 TO 6") ~ "4–6",
str_detect(x_std,"7") ~ "7+",
TRUE ~ NA_character_
)
}
meals_clean <- clean_meals(meals_raw)
meals_summary <- tibble::tibble(meals = meals_clean) %>%
filter(!is.na(meals)) %>%
count(meals) %>%
mutate(
percent = round(100 * n / sum(n), 1),
label = paste0(percent, "%")
)
ggplot(meals_summary, aes(x = meals, y = percent)) +
geom_col(fill = "#1F4E79") +
geom_text(aes(label = label), vjust = -0.3, size = 3.8) +
theme_bw() +
labs(
title = "Meals prepared using program food (past week)",
x = "Number of meals",
y = "% of participants"
)

## ---- impact_ease_of_use --------------------------------------------
## ---- impact_ease_of_use_final_fix ----------------------------------
ease_raw <- df_COHORT1$`T1- In the past week, how easy was it to use the food in your meals?`
ease_clean <- dplyr::case_when(
trimws(ease_raw) == "1" ~ "Very easy",
TRUE ~ str_to_sentence(trimws(ease_raw))
)
ease_summary <- tibble(response = ease_clean) %>%
filter(!is.na(response)) %>%
count(response) %>%
mutate(
percent = round(100 * n / sum(n), 1),
response = factor(
response,
levels = c("Neutral", "Easy", "Very easy"),
ordered = TRUE
)
)
ggplot(ease_summary, aes(x = response, y = percent)) +
geom_col(fill = "#2E86C1") +
geom_text(
aes(label = paste0(percent, "%")),
vjust = -0.3,
size = 4
) +
theme_bw() +
labs(
title = "Ease of Using Program Food in Meals (T1)",
x = "",
y = "% of participants"
) +
theme(
axis.text.x = element_text(size = 11),
plot.title = element_text(size = 14, face = "bold")
)

## -------------------------------
## FOOD WASTE (T1 ONLY – IMPACT)
## -------------------------------
# 1. Identify the T1 waste column explicitly
waste_col <- grep(
"^T1-.*throw away any of the food",
names(df_COHORT1),
value = TRUE
)
# sanity check (must be length 1)
waste_col
## [1] "T1-Did you throw away any of the food from your most recent delivery?"
stopifnot(length(waste_col) == 1)
# 2. Pull raw responses
waste_raw <- df_COHORT1[[waste_col]]
# 3. Clean Yes / No
waste_clean <- case_when(
toupper(trimws(waste_raw)) %in% c("YES", "Y") ~ "Yes",
toupper(trimws(waste_raw)) %in% c("NO", "N") ~ "No",
TRUE ~ NA_character_
)
# 4. Build summary table
waste_summary <- tibble(Response = waste_clean) %>%
filter(!is.na(Response)) %>%
count(Response) %>%
mutate(
percent = round(100 * n / sum(n), 1),
label = paste0(percent, "%\n(n=", n, ")")
)
# 5. Plot (simple, low-risk)
ggplot(waste_summary, aes(x = Response, y = percent, fill = Response)) +
geom_col(width = 0.6) +
geom_text(aes(label = label), vjust = -0.3, size = 4) +
scale_fill_manual(
values = c("No" = "#1a9850", "Yes" = "#d73027")
) +
scale_y_continuous(
limits = c(0, 100),
labels = function(x) paste0(x, "%")
) +
theme_bw() +
labs(
title = "Did Patients Throw Away Delivered Food? (T1)",
x = "",
y = "% of participants"
) +
theme(
legend.position = "none",
plot.title = element_text(size = 14, face = "bold")
)

## ------------------------------
## Program impact on feeling in control of diabetes (T1)
# 1. Pull the T1 column
control_col <- grep(
"^T1-How much did this program help you feel more in control of your diabetes",
names(df_COHORT1),
value = TRUE
)
control_raw <- df_COHORT1[[control_col]]
# 2. Clean / recode responses
control_clean <- toupper(trimws(as.character(control_raw)))
control_clean <- dplyr::case_when(
control_clean %in% c("NOT AT ALL", "0") ~ "Not at all",
control_clean %in% c("A LITTLE", "A LITTLE ", "1") ~ "A little",
control_clean %in% c("SOMEWHAT", "2") ~ "Somewhat",
control_clean %in% c("A LOT", "A LOT ", "3") ~ "A lot",
TRUE ~ NA_character_
)
# 3. Summarize % of participants
control_summary <- tibble::tibble(response = control_clean) %>%
dplyr::filter(!is.na(response)) %>%
dplyr::count(response) %>%
dplyr::mutate(
percent = round(100 * n / sum(n), 1),
response = factor(response,
levels = c("Not at all", "A little", "Somewhat", "A lot"))
)
# 4. Plot
ggplot(control_summary, aes(x = response, y = percent)) +
geom_col(fill = "#4E79A7", width = 0.7) +
geom_text(
aes(label = paste0(percent, "%")),
vjust = -0.3,
size = 4
) +
theme_bw() +
labs(
title = "Program Impact on Feeling in Control of Diabetes (T1)",
x = "",
y = "% of participants"
) +
coord_cartesian(ylim = c(0, 100)) +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.text.x = element_text(size = 11, angle = 30, hjust = 1),
axis.text.y = element_text(size = 11)
)

## ---- df2_new_impact, echo = TRUE -----------------------------------
## This chunk assumes df, df_status, df_COHORT1 are already defined
## exactly as in your current script.
## =========================
## A. Latest A1c under 8%?
## =========================
a1c_latest_summary <- df_status %>%
filter(Enrolled == "Enrolled") %>%
mutate(
A1c_under8_latest = case_when(
`A1c Under 8? (8/1/25-10/2/25)` %in% c("Y", "y", "YES", "Yes") ~ "Under 8%",
`A1c Under 8? (8/1/25-10/2/25)` %in% c("N", "n", "NO", "No") ~ "8% or higher",
TRUE ~ NA_character_
)
) %>%
filter(!is.na(A1c_under8_latest)) %>%
count(A1c_under8_latest) %>%
mutate(
percent = round(100 * n / sum(n), 1),
A1c_under8_latest = factor(
A1c_under8_latest,
levels = c("8% or higher", "Under 8%")
)
)
p_a1c_latest <- ggplot(a1c_latest_summary,
aes(x = A1c_under8_latest, y = percent)) +
geom_col(fill = "#4E79A7") +
geom_text(aes(label = paste0(percent, "%")),
vjust = -0.3, size = 3.8) +
theme_bw() +
labs(
title = "Latest A1c Control (8/1/25–10/2/25) – Enrolled Patients",
x = "",
y = "% of enrolled patients"
) +
scale_y_continuous(
limits = c(0, 100),
labels = function(x) paste0(x, "%")
)
## =========================
## B. ED/IP Utilization at T1
## =========================
util_base <- df_status %>%
filter(Enrolled == "Enrolled") %>%
mutate(
ED_num = suppressWarnings(as.numeric(`ED Utilization (T1)`)),
IP_num = suppressWarnings(as.numeric(`IP Utilization (T1)`)),
ED_group = case_when(
is.na(ED_num) ~ NA_character_,
ED_num == 0 ~ "0 ED visits",
ED_num >= 1 ~ "≥1 ED visit"
),
IP_group = case_when(
is.na(IP_num) ~ NA_character_,
IP_num == 0 ~ "0 IP stays",
IP_num >= 1 ~ "≥1 IP stay"
)
)
ed_summary <- util_base %>%
filter(!is.na(ED_group)) %>%
count(ED_group) %>%
mutate(
percent = round(100 * n / sum(n), 1),
ED_group = factor(ED_group, levels = c("0 ED visits", "≥1 ED visit"))
)
ip_summary <- util_base %>%
filter(!is.na(IP_group)) %>%
count(IP_group) %>%
mutate(
percent = round(100 * n / sum(n), 1),
IP_group = factor(IP_group, levels = c("0 IP stays", "≥1 IP stay"))
)
p_ed <- ggplot(ed_summary, aes(x = ED_group, y = percent)) +
geom_col(fill = "#1F4E79") +
geom_text(aes(label = paste0(percent, "%")),
vjust = -0.3, size = 3.5) +
theme_bw() +
labs(
title = "ED Utilization at T1 – Enrolled Patients",
x = "",
y = "% of enrolled patients"
) +
scale_y_continuous(
limits = c(0, 100),
labels = function(x) paste0(x, "%")
)
p_ip <- ggplot(ip_summary, aes(x = IP_group, y = percent)) +
geom_col(fill = "#2E86C1") +
geom_text(aes(label = paste0(percent, "%")),
vjust = -0.3, size = 3.5) +
theme_bw() +
labs(
title = "Inpatient Utilization at T1 – Enrolled Patients",
x = "",
y = "% of enrolled patients"
) +
scale_y_continuous(
limits = c(0, 100),
labels = function(x) paste0(x, "%")
)
## If you want them stacked vertically in one display:
(p_a1c_latest) / (p_ed | p_ip)

## =========================
## C. Social Needs / SDoH Flags
## =========================
sdoh_cols <- c(
"Food", "Housing", "Housing 2", "Finance",
"Transport", "Utilities", "Legal", "Violence", "Caregiver"
)
sdoh_long <- df_status %>%
filter(Enrolled == "Enrolled") %>%
mutate(patient_id = row_number()) %>%
select(patient_id, all_of(sdoh_cols)) %>%
tidyr::pivot_longer(
cols = all_of(sdoh_cols),
names_to = "Domain",
values_to = "raw"
) %>%
mutate(
raw_std = toupper(trimws(as.character(raw))),
Need = case_when(
raw_std %in% c("Y", "YES") ~ "Need identified",
raw_std %in% c("N", "NO") ~ "No need",
TRUE ~ NA_character_
)
) %>%
filter(!is.na(Need))
sdoh_summary <- sdoh_long %>%
count(Domain, Need) %>%
group_by(Domain) %>%
mutate(
percent = round(100 * n / sum(n), 1)
) %>%
ungroup()
# Order domains by % with a "Need identified"
sdoh_order <- sdoh_summary %>%
filter(Need == "Need identified") %>%
arrange(desc(percent)) %>%
pull(Domain)
sdoh_summary$Domain <- factor(sdoh_summary$Domain, levels = sdoh_order)
p_sdoh <- ggplot(sdoh_summary,
aes(x = Domain, y = percent, fill = Need)) +
geom_col(position = "fill") +
scale_y_continuous(
labels = function(x) paste0(x * 100, "%")
) +
coord_flip() +
scale_fill_manual(
values = c("Need identified" = "#d73027", "No need" = "#1a9850")
) +
theme_bw() +
labs(
title = "Social Needs by Domain – Enrolled Patients",
x = "Domain",
y = "Within-domain % of patients",
fill = ""
) +
theme(
axis.text.y = element_text(size = 9),
plot.title = element_text(size = 14, face = "bold")
)
p_sdoh

## ---- find_A1c_columns ---------------------------------------------------
## ---- A1c_change_baseline_to_T1 --------------------------------------
# 0. Quickly confirm the A1c-related columns that exist in DF2
grep("A1c|HbA1c|Hemoglobin", names(df), value = TRUE)
## [1] "HbA1c Value (Last 3 Months)" "A1c Under 8? (8/1/25-10/2/25)"
## [3] "Hemoglobin A1c (T1)" "A1c Change (T1)"
## [5] "A1c Under 8? (T1)"
# 1. Helper to clean A1c values safely
clean_a1c <- function(x) {
x <- trimws(as.character(x))
# Treat common NA codes as NA
x[x %in% c("", "NA", "N/A", "na", "n/a", "Na", "NULL", "null")] <- NA
# Handle inequality codes if they exist
x[x == "<4.0"] <- "4.0"
x[x == ">14.0"] <- "14.0"
x[x == ">20.0"] <- "20.0"
# Strip percent signs
x <- gsub("%", "", x)
as.numeric(x)
}
# 2. Build data frame with baseline, T1, and change
df_a1c <- df %>%
mutate(
A1c_baseline = clean_a1c(`HbA1c Value (Last 3 Months)`),
A1c_T1 = clean_a1c(`Hemoglobin A1c (T1)`),
A1c_change = A1c_T1 - A1c_baseline
) %>%
# Keep only patients with both measurements
filter(!is.na(A1c_baseline), !is.na(A1c_T1))
## ---- A1c_change_categorical ---------------------------------------
df_a1c_cat <- df_a1c %>%
mutate(
A1c_change_cat = case_when(
A1c_change <= -1.0 ~ "Improved (≥1.0% ↓)",
A1c_change >= 1.0 ~ "Worsened (≥1.0% ↑)",
TRUE ~ "No meaningful change"
)
)
a1c_cat_summary <- df_a1c_cat %>%
count(A1c_change_cat) %>%
mutate(
percent = round(100 * n / sum(n), 1),
label = paste0(percent, "%")
)
ggplot(a1c_cat_summary,
aes(x = A1c_change_cat, y = percent, fill = A1c_change_cat)) +
geom_col(width = 0.7) +
geom_text(aes(label = label), vjust = -0.3, size = 4) +
scale_fill_manual(values = c(
"Improved (≥1.0% ↓)" = "#1a9850",
"No meaningful change" = "#4E79A7",
"Worsened (≥1.0% ↑)" = "#d73027"
)) +
theme_bw() +
labs(
title = "Change in HbA1c from Baseline to T1",
x = "",
y = "% of patients"
) +
theme(
legend.position = "none",
plot.title = element_text(size = 14, face = "bold"),
axis.text.x = element_text(size = 11)
)
