library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(tidyr)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# ---- Set up ----
# Load the de-identified workbook. Every column is read as text
# (col_types = "text"); numeric fields such as Age are converted explicitly
# further down.
df <- read_excel(
  path = "DMFIM_DEID_11_25.xlsx",
  col_types = "text"
)
# Print the column names for reference.
names(df)
## [1] "Pre Enrollment Notes"
## [2] "Age"
## [3] "Preferred Language"
## [4] "Disch Date/Time"
## [5] "Phone"
## [6] "CC"
## [7] "Admission Diagnosis"
## [8] "Discharge Disposition"
## [9] "LACE+ READMISSION SCORE Score Column"
## [10] "Unplanned Readmission Score"
## [11] "Primary Cvg"
## [12] "HbA1c Value (Last 3 Months)"
## [13] "Pt. Portal Status"
## [14] "Hosp Last 365 Days"
## [15] "ED Vis Last 90 Days"
## [16] "Hosp Last 90 Days"
## [17] "PCP"
## [18] "Race"
## [19] "Ethnicity"
## [20] "Legal Sex"
## [21] "SDOH High Risk Domains"
## [22] "Zip code"
## [23] "Prescreen Contacted Y/N"
## [24] "PS \"In the past 12 months, were you ever worried your food would run out before you had money to buy more?\""
## [25] "PS \"In the past 12 months, did the food you could afford not last until the end of the month, and you couldn’t get more?\""
## [26] "PS \"In the last month, did anyone in your household have to skip meals due to lack of food?\""
## [27] "(Pre Screen) Interested in Program?"
## [28] "T0 Contact"
## [29] "(Screen for Enrollment) Interested in Program"
## [30] "T0- Are you likely to eat the food as part of your normal daily routine?"
## [31] "T0- In the past week, how often did you check your blood sugar?"
## [32] "T0- Are you Confident in making food choices that help control your blood sugar?"
## [33] "T0- Are you confident in understanding how to read food labels and nutrition information?"
## [34] "T0- Were your sugar levels usually in your target range last week?"
## [35] "T0- In the past week, did you miss any doses of you diabetes medications?"
## [36] "T0- Since your discharge from the hospital, have you had an appointment with a healthcare provider for your diabetes?"
## [37] "T1 Contact (Y/N)"
## [38] "T1 NOTES"
## [39] "T1- In the past week, how many meals did you make using food from the program?"
## [40] "T1-Did you throw away any of the food from your most recent delivery?"
## [41] "T1- In the past week, how easy was it to use the food in your meals?"
## [42] "T1- In the past week, how often did you check your blood sugar?"
## [43] "T1- Were your sugar levels usually in your target range last week?"
## [44] "T1-Did you have any symptoms of high or low blood sugar in the past week?"
## [45] "T1- In the past week, did you miss any doses of you diabetes medications?"
## [46] "T1- Since our last check in, have you had an appointment with a healthcare provider for your diabetes?"
## [47] "T1-Have you done anything in the past week to help manage your diabetes — like walking, portion control, or drinking more water? "
## [48] "T1-How much did this program help you feel more in control of your diabetes? "
## [49] "T1-Did the nutrition material you got from White Plains Hospital help you make healthier food choices?"
## [50] "T1- How Confident are you in making food choices that help control your blood sugar?"
## [51] "T1-How confident are you in understanding food labels or nutrition information? "
## [52] "T1-How confident are you in your ability to prepare healthy meals with the food you typically have at home? "
## [53] "T1-Do you have any suggestions or feedback to improve this program for others? "
## [54] "T2 Contact (Y/N)"
## [55] "T2 NOTES"
## [56] "T2- In the past week, how many meals did you make using food from the program?"
## [57] "T2-Did you throw away any of the food from your most recent delivery?"
## [58] "T2- In the past week, how easy was it to use the food in your meals?"
## [59] "T2- In the past week, how often did you check your blood sugar?"
## [60] "T2- Were your sugar levels usually in your target range last week?"
## [61] "T2-In the past week, did you have any symptoms of high or low blood sugar?"
## [62] "T2- In the past week, did you miss any doses of you diabetes medications?"
## [63] "T2- Since our last check in, have you had an appointment with a healthcare provider for your diabetes?"
## [64] "T2-Have you done anything in the past week to help manage your diabetes — like walking, portion control, or drinking more water? "
## [65] "T2-How much did this program help you feel more in control of your diabetes? "
## [66] "T2-Did the nutrition material you got from White Plains Hospital help you make healthier food choices?"
## [67] "T2- How Confident are you in making food choices that help control your blood sugar?"
## [68] "T2-How confident are you in understanding food labels or nutrition information? "
## [69] "T2-How confident are you in your ability to prepare healthy meals with the food you typically have at home? "
## [70] "T2-Do you have any suggestions or feedback to improve this program for others? "
## [71] "Enrolled"
## [72] "Cohort"
## [73] "Delivery 1"
## [74] "D1- Comments"
## [75] "Delivery 2"
## [76] "02- Comments"
## [77] "Delivery 3"
## [78] "03- Comments"
## [79] "Delivery 4"
## [80] "Hemoglobin A1c (8/1/25-10/2/2025)"
## [81] "Hospital Utilization (8/1/25-10/2/25)"
## [82] "A1c Change"
## [83] "A1c Under 8?"
## [84] "Hemoglobin A1c (T1)"
## [85] "Hospital Utilization (T1)"
## [86] "A1c Change (T1)"
## [87] "A1c Under 8? (T1)"
## [88] "additional feedback"
# Survey answer columns: names starting with "T0-", "T1-" or "T2-".
survey_cols <- names(df)[grepl("^T[012]-", names(df))]
# Placeholder strings that are treated as missing values.
na_codes <- c("", "NA", "N/A", "na", "n/a", "Na", "NULL", "null")
# For every survey column: strip surrounding whitespace, then replace any
# placeholder string with a real NA.
df <- df |>
  mutate(
    across(
      all_of(survey_cols),
      \(col) {
        trimmed <- trimws(as.character(col))
        trimmed[trimmed %in% na_codes] <- NA_character_
        trimmed
      }
    )
  )
# T0 survey question columns; any name mentioning Contact / NOTES is dropped.
t0_q_cols <- grep("^T0-", names(df), value = TRUE)
t0_q_cols <- grep(
  "Contact|NOTES", t0_q_cols,
  ignore.case = TRUE, value = TRUE, invert = TRUE
)
# T1 survey question columns, filtered the same way.
t1_q_cols <- grep("^T1-", names(df), value = TRUE)
t1_q_cols <- grep(
  "Contact|NOTES", t1_q_cols,
  ignore.case = TRUE, value = TRUE, invert = TRUE
)
### Cohort 1 -- patients with complete T0 and T1 survey data
# A row is kept only when every T0 question AND every T1 question is
# non-missing. Age is then converted from text to numeric ("" and "NA"
# become NA first).
# NOTE(review): t1_q_cols is built from every "^T1-" column, which includes
# the free-text "suggestions or feedback" question, so participants who left
# it blank are excluded -- confirm this is intended.
df_COHORT1 <- df |>
  filter(
    if_all(all_of(t0_q_cols), \(col) !is.na(col)),
    if_all(all_of(t1_q_cols), \(col) !is.na(col))
  ) |>
  mutate(
    Age = as.numeric(
      na_if(na_if(trimws(as.character(Age)), ""), "NA")
    )
  )
### COHORT1 Demographics -- age
# Observed range of ages (true minimum and maximum).
min(df_COHORT1$Age, na.rm = TRUE)
## [1] 23
max(df_COHORT1$Age, na.rm = TRUE)
## [1] 91
# Boxplot statistics. boxplot.stats()$stats holds, in order: lower whisker,
# lower hinge, median, upper hinge, upper whisker. The whisker ends are the
# most extreme points within 1.5 * IQR of the hinges, so they differ from the
# true min/max whenever there are outliers (here 27 vs 23 and 79 vs 91).
age_bp <- boxplot.stats(df_COHORT1$Age)
age_bp$stats
## [1] 27 50 60 66 79
# Label the five values for what they are rather than as "Min"/"Max".
age_stats <- data.frame(
  stat = c(
    "Lower whisker", "Q1 (lower hinge)", "Median",
    "Q3 (upper hinge)", "Upper whisker"
  ),
  value = round(age_bp$stats, 1)
)
age_stats
## stat value
## 1 Lower whisker 27
## 2 Q1 (lower hinge) 50
## 3 Median 60
## 4 Q3 (upper hinge) 66
## 5 Upper whisker 79
# Age histogram for Cohort 1 with rug marks plus mean/median reference lines.
ggplot(df_COHORT1, aes(x = Age)) +
  geom_histogram(
    binwidth = 5,
    boundary = 0,
    fill = "#5DADE2",
    color = "white",
    alpha = 0.9
  ) +
  # Rug marks
  geom_rug(
    sides = "b",
    alpha = 0.7,
    color = "#1F618D",
    length = unit(0.05, "npc")
  ) +
  # Mean + median. The intercepts are passed as constants (not inside aes())
  # so a single line is drawn instead of one overplotted line per row, and
  # `linewidth` replaces the `size` argument deprecated for lines in
  # ggplot2 3.4.0.
  geom_vline(
    xintercept = mean(df_COHORT1$Age, na.rm = TRUE),
    color = "#C0392B", linewidth = 1.1, linetype = "dashed"
  ) +
  geom_vline(
    xintercept = median(df_COHORT1$Age, na.rm = TRUE),
    color = "#27AE60", linewidth = 1.1, linetype = "dashed"
  ) +
  # Labels for mean/median
  annotate(
    "text",
    x = mean(df_COHORT1$Age, na.rm = TRUE),
    y = 0.5,
    label = paste0("Mean: ", round(mean(df_COHORT1$Age, na.rm = TRUE), 1)),
    color = "#C0392B",
    hjust = -0.1,
    size = 4
  ) +
  annotate(
    "text",
    x = median(df_COHORT1$Age, na.rm = TRUE),
    y = 1.2,
    label = paste0("Median: ", round(median(df_COHORT1$Age, na.rm = TRUE), 1)),
    color = "#27AE60",
    hjust = 1.1,
    size = 4
  ) +
  # X-axis ticks every 10 years, spanning the decade below the youngest age
  # to the decade above the oldest.
  scale_x_continuous(
    breaks = seq(
      floor(min(df_COHORT1$Age, na.rm = TRUE) / 10) * 10,
      ceiling(max(df_COHORT1$Age, na.rm = TRUE) / 10) * 10,
      by = 10
    )
  ) +
  labs(
    title = "Age Distribution – Cohort 1",
    subtitle = "Histogram shows the number of participants in each 5-year age range.\nDashed lines show mean (red) and median (green).",
    x = "Age (years)",
    y = "Number of participants"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 11),
    axis.text = element_text(size = 11),
    axis.title = element_text(size = 12)
  )

### COHORT1 Demographics -- sex
# Count participants per recorded legal sex and express each count as a
# percentage of all counted rows (rounded to one decimal).
sex_summary <- df_COHORT1 |>
  count(`Legal Sex`) |>
  mutate(percent = round(100 * n / sum(n), 1))
sex_summary
## # A tibble: 2 × 3
## `Legal Sex` n percent
## <chr> <int> <dbl>
## 1 Female 19 57.6
## 2 Male 14 42.4
# Pie chart: a single stacked column bent into polar coordinates, with the
# percentage printed at the middle of each slice.
ggplot(sex_summary, aes(x = "", y = percent, fill = `Legal Sex`)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  geom_label(
    aes(label = paste0(percent, "%")),
    color = "black",
    position = position_stack(vjust = 0.5),
    show.legend = FALSE
  ) +
  labs(
    title = "Legal Sex Distribution – Cohort 1",
    fill = "Legal Sex"
  ) +
  theme_void() +
  theme(
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    legend.position = "right"
  )

# ---- Race ----
# Keep only the Race column of Cohort 1 for the race breakdown.
race_df <- select(df_COHORT1, Race)
# Split one raw race string into its individual race entries.
#
# Each entry is assumed to start with a code of the form "R<digits>"
# (e.g. "R3 Black or African-American R5 White"); the text from one code up
# to just before the next code (or the end of the string) is one entry.
#
# x: a single string (the function is applied per element via lapply()).
#    Only the first element is used if a longer vector is supplied.
# Returns a character vector with one trimmed entry per code found, or
# NA_character_ when x is missing, empty (zero-length), or contains no code.
split_race_base <- function(x) {
  # Guard missing / zero-length input: gregexpr() yields NA (or an empty
  # list) for these, which previously made the `== -1` test fail with an
  # error.
  if (length(x) == 0L || is.na(x[[1]])) {
    return(NA_character_)
  }
  x <- as.character(x[[1]])
  code_starts <- as.integer(gregexpr("R[0-9]+", x)[[1]])
  if (code_starts[1] == -1L) {
    return(NA_character_)
  }
  # Every entry ends one character before the next code starts; the last
  # entry runs to the end of the string.
  entry_ends <- c(code_starts[-1] - 1L, nchar(x))
  trimws(substring(x, code_starts, entry_ends))
}
# Expand each participant's Race string into one row per race entry.
race_expanded <- race_df |>
  mutate(Race = lapply(Race, split_race_base)) |>
  unnest(Race)
# Count the entries per race category and convert to percentages.
# NOTE(review): the denominator is the number of race entries, not the number
# of participants -- the counts below sum to 37 while the sex table above
# totals 33 participants, so the "Percentage of Sample" axis label may
# overstate what is shown. Confirm the intended denominator.
race_summary <- race_expanded |>
  filter(!is.na(Race)) |>
  count(Race) |>
  mutate(percent = round(100 * n / sum(n), 1))
race_summary
## # A tibble: 5 × 3
## Race n percent
## <chr> <int> <dbl>
## 1 R1 American Indian or Alaska Native 1 2.7
## 2 R3 Black or African-American 14 37.8
## 3 R4 Native Hawaiian 1 2.7
## 4 R5 White 4 10.8
## 5 R9 Other 17 45.9
# Horizontal bar chart, categories ordered by percentage, with value labels
# placed just past the end of each bar.
ggplot(race_summary, aes(x = reorder(Race, percent), y = percent)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  geom_text(
    aes(label = paste0(percent, "%")),
    hjust = -0.1, size = 4
  ) +
  labs(
    title = "Race Distribution – Cohort 1",
    x = "Race Category",
    y = "Percentage of Sample (%)"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.text.y = element_text(size = 10)
  ) +
  # Leave headroom beyond the longest bar so its label is not cut off.
  ylim(0, max(race_summary$percent) + 5)

### ETHNICITY – Cohort 1
# The raw values are cleaned and mapped as whole strings (they are not split
# into separate "E" fragments).
# 1. Raw column
eth_raw <- df_COHORT1$Ethnicity
# 2-3. Strip bracketed indices such as [1], [2], then trim whitespace
eth_clean <- trimws(gsub("\\[[^]]*\\]", "", eth_raw))
# 4. Discard missing and blank entries
eth_clean <- eth_clean[!is.na(eth_clean) & eth_clean != ""]
# 5. Translate the E1 / E2 codes into readable labels; anything else -> NA
eth_label <- ifelse(
  grepl("E1", eth_clean),
  "Spanish/Hispanic/Latino",
  ifelse(
    grepl("E2", eth_clean),
    "Not Spanish/Hispanic/Latino",
    NA
  )
)
# 6. Discard entries that matched neither code
eth_label <- eth_label[!is.na(eth_label)]
# 7. Counts and percentages per label
tab_eth <- table(eth_label)
eth_summary <- data.frame(
  Ethnicity = names(tab_eth),
  n = as.vector(tab_eth)
)
eth_summary[["percent"]] <- round(100 * eth_summary$n / sum(eth_summary$n), 1)
eth_summary
## Ethnicity n percent
## 1 Not Spanish/Hispanic/Latino 17 51.5
## 2 Spanish/Hispanic/Latino 16 48.5
# Horizontal bar chart of the ethnicity percentages.
ggplot(eth_summary, aes(x = reorder(Ethnicity, percent), y = percent)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Ethnicity Distribution – Cohort 1",
    x = "Ethnicity",
    y = "Percentage of sample (%)"
  ) +
  theme_bw() +
  theme(
    axis.text.y = element_text(size = 10),
    plot.title = element_text(size = 14, face = "bold")
  )

# Audit: list the distinct cleaned raw values that fed the mapping above.
sort(unique(eth_clean))
## [1] "E1 Spanish/Hispanic/Latino" "E2 Not Spanish/Hispanic/Latino"
# ---- A1C ----
# Convert the text HbA1c column to a numeric column named HbA1c.
df_COHORT1 <- df_COHORT1 |>
  mutate(HbA1c = as.numeric(`HbA1c Value (Last 3 Months)`))
# Descriptive statistics over the non-missing HbA1c values.
hba1c_summary <- df_COHORT1 |>
  summarise(
    n = sum(!is.na(HbA1c)),
    mean = mean(HbA1c, na.rm = TRUE),
    sd = sd(HbA1c, na.rm = TRUE),
    min = min(HbA1c, na.rm = TRUE),
    max = max(HbA1c, na.rm = TRUE)
  )
hba1c_summary
## # A tibble: 1 × 5
## n mean sd min max
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 33 10.7 1.73 8 14.7
# Compute boxplot stats manually.
# boxplot.stats()$stats holds, in order: lower whisker, lower hinge, median,
# upper hinge, upper whisker. The whisker ends equal the true min/max only
# when there are no outliers; because the plot below hides outliers
# (outlier.shape = NA), the points are labelled as whiskers so the figure
# never presents a whisker end as the data minimum or maximum.
bp <- boxplot.stats(df_COHORT1$HbA1c)
stats_df <- data.frame(
  stat = c(
    "Lower whisker", "Q1 (lower hinge)", "Median",
    "Q3 (upper hinge)", "Upper whisker"
  ),
  value = round(bp$stats, 2)
)
ggplot(df_COHORT1, aes(x = "", y = HbA1c)) +
  geom_boxplot(fill = "#AED6F1", width = 0.3, outlier.shape = NA) +
  # Add red points at each statistic
  geom_point(
    data = stats_df,
    aes(x = "", y = value),
    color = "red", size = 3
  ) +
  # Label each point with the statistic name and its HbA1c value
  geom_text(
    data = stats_df,
    aes(x = "", y = value, label = paste0(stat, ": ", value)),
    nudge_x = 0.25, size = 3.5, hjust = 0
  ) +
  theme_bw() +
  labs(
    title = "HbA1c Boxplot with Statistical Markers — Cohort 1",
    x = "",
    y = "HbA1c (%)"
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.title = element_text(size = 16, face = "bold")
  ) +
  coord_cartesian(clip = "off") # Allow labels to extend outside plot area

### Blood sugar check
## 1. Locate the T0 and T1 "how often did you check your blood sugar"
##    columns by name prefix, then pull their raw values.
t0_col <- names(df_COHORT1)[
  grepl(
    "^T0- In the past week, how often did you check your blood sugar",
    names(df_COHORT1)
  )
]
t1_col <- names(df_COHORT1)[
  grepl(
    "^T1- In the past week, how often did you check your blood sugar",
    names(df_COHORT1)
  )
]
t0_raw <- df_COHORT1[[t0_col]]
t1_raw <- df_COHORT1[[t1_col]]
## 2. Cleaning function: only collapses clearly-equivalent strings
# Map free-text blood-sugar-check frequencies onto five standard categories.
#
# x: character vector of raw responses.
# Returns a character vector of the same length containing one of
#   "1–2 times", "3–4 times", "Daily", "More than once daily", "Not at all",
#   "Other (check)" for any non-missing response that is not recognised, or
#   NA where the response itself is missing (a missing answer is not an
#   unexpected answer, so it is no longer reported as "Other (check)").
#
# Before matching, responses are upper-cased, Unicode dash variants are
# folded to "-", and runs of whitespace (including non-breaking spaces) are
# collapsed to one space, so a single key per wording is sufficient.
clean_freq <- function(x) {
  x_std <- toupper(trimws(as.character(x)))
  # Hyphen / figure dash / en dash / em dash / minus sign -> plain hyphen.
  x_std <- gsub("[\u2010\u2011\u2012\u2013\u2014\u2015\u2212]", "-", x_std)
  # Collapse any whitespace run (incl. non-breaking space) to one space.
  x_std <- trimws(gsub("[[:space:]\u00a0]+", " ", x_std))
  category_by_key <- c(
    # 1–2 times (all variants)
    "1-2 TIMES" = "1–2 times",
    "1 TO 2 TIMES" = "1–2 times",
    # 3–4 times
    "3-4 TIMES" = "3–4 times",
    # Daily
    "DAILY" = "Daily",
    # More than once daily
    "MORE THAN DAILY" = "More than once daily",
    "MORE THAN ONCE DAILY" = "More than once daily",
    "MORE THAN ONCE A DAY" = "More than once daily",
    # Not at all
    "NOT AT ALL" = "Not at all"
  )
  cleaned <- unname(category_by_key[x_std])
  # Anything unexpected (but not missing) gets labeled as "Other (check)"
  cleaned[is.na(cleaned) & !is.na(x_std)] <- "Other (check)"
  cleaned
}
## 3. Long data frame holding BOTH the raw and the cleaned responses,
##    T0 rows first, then T1 rows.
df_bs <- data.frame(
  Timepoint = rep(c("T0", "T1"), times = c(length(t0_raw), length(t1_raw))),
  Response_raw = c(t0_raw, t1_raw),
  Response_clean = clean_freq(c(t0_raw, t1_raw)),
  stringsAsFactors = FALSE
)
## 4. Quick audit: cross-tabulate raw vs. cleaned values to verify the mapping
table(df_bs$Response_raw, df_bs$Response_clean)
##
## 1–2 times 3–4 times Daily More than once daily
## 1 to 2 TIMES 1 0 0 0
## 1-2 times 2 0 0 0
## 1-2 TIMES 4 0 0 0
## 3-4 times 0 4 0 0
## 3-4 TIMES 0 1 0 0
## daily 0 0 5 0
## Daily 0 0 9 0
## DAILY 0 0 7 0
## MORE THAN DAILY 0 0 0 3
## MORE THAN ONCE DAILY 0 0 0 3
## MORE THAN ONCE A DAY 0 0 0 5
## More than once daily 0 0 0 18
## Not at all 0 0 0 0
## NOT AT ALL 0 0 0 0
##
## Not at all
## 1 to 2 TIMES 0
## 1-2 times 0
## 1-2 TIMES 0
## 3-4 times 0
## 3-4 TIMES 0
## daily 0
## Daily 0
## DAILY 0
## MORE THAN DAILY 0
## MORE THAN ONCE DAILY 0
## MORE THAN ONCE A DAY 0
## More than once daily 0
## Not at all 2
## NOT AT ALL 2
## 5. Plot the cleaned categories
# Order the categories from least to most frequent monitoring.
# "Other (check)" is intentionally not a level since none were left.
df_bs[["Response_clean"]] <- factor(
  df_bs[["Response_clean"]],
  levels = c(
    "Not at all",
    "1–2 times",
    "3–4 times",
    "Daily",
    "More than once daily"
  )
)
# Side-by-side bars (T0 vs T1) of the number of responses per category.
ggplot(df_bs, aes(x = Response_clean, fill = Timepoint)) +
  geom_bar(position = "dodge") + # bar height is the row count
  labs(
    title = "Blood Sugar Monitoring Frequency (T0 vs T1)",
    x = "Monitoring frequency (grouped)",
    y = "Count"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 9),
    plot.title = element_text(size = 15, face = "bold")
  )

##### Blood sugar in target range at T0 vs T1
sugar_T0_raw <- df_COHORT1[["T0- Were your sugar levels usually in your target range last week?"]]
sugar_T1_raw <- df_COHORT1[["T1- Were your sugar levels usually in your target range last week?"]]
# T0 answers are Yes / No / Maybe; match case-insensitively after trimming.
# Anything else becomes NA.
sugar_T0_std <- toupper(trimws(sugar_T0_raw))
sugar_T0_clean <- unname(
  c("YES" = "Yes", "NO" = "No", "MAYBE" = "Maybe")[sugar_T0_std]
)
table(sugar_T0_clean, useNA = "ifany")
## sugar_T0_clean
## Maybe No Yes
## 11 11 11
# Normalise the T1 answers once (character, trimmed, upper-case).
sugar_T1_std <- toupper(trimws(as.character(sugar_T1_raw)))
# Collapse the spelling variants onto a five-point frequency scale.
# Note that "YES"/"Y" are counted as "Most of the time" and "N" as "Never";
# any other value becomes NA.
sugar_T1_clean <- unname(
  c(
    "NEVER" = "Never",
    "N" = "Never",
    "SOMETIMES" = "Sometimes",
    "ABOUT HALF THE TIME" = "About half the time",
    "HALF THE TIME" = "About half the time",
    "MORE THAN HALF THE TIME" = "Most of the time",
    "MOST OF THE TIME" = "Most of the time",
    "MOST OF TIME" = "Most of the time",
    "YES" = "Most of the time",
    "Y" = "Most of the time",
    "ALWAYS" = "Always"
  )[sugar_T1_std]
)
table_clean <- table(sugar_T1_clean, useNA = "ifany")
table_clean
## sugar_T1_clean
## About half the time Always Most of the time Never
## 2 2 17 3
## Sometimes
## 9
table(sugar_T1_clean, useNA = "ifany")
## sugar_T1_clean
## About half the time Always Most of the time Never
## 2 2 17 3
## Sometimes
## 9
# One data frame per timepoint; each keeps its own ordered response scale
# (T0: No/Maybe/Yes, T1: five-point frequency scale).
df_sugar_T0 <- data.frame(
  Timepoint = "T0",
  Response = factor(sugar_T0_clean, levels = c("No", "Maybe", "Yes"))
)
df_sugar_T1 <- data.frame(
  Timepoint = "T1",
  Response = factor(
    sugar_T1_clean,
    levels = c(
      "Never",
      "Sometimes",
      "About half the time",
      "Most of the time",
      "Always"
    )
  )
)
# Stack the two timepoints and count responses per timepoint and category.
df_sugar_long <- rbind(df_sugar_T0, df_sugar_T1)
df_counts <- count(df_sugar_long, Timepoint, Response)
# One panel per timepoint; free x scales so each panel only shows its own
# response categories. Counts are printed above the bars.
ggplot(df_counts, aes(x = Response, y = n)) +
  geom_col(fill = "#2E86C1") +
  geom_text(aes(label = n), vjust = -0.3, size = 4) +
  facet_wrap(~ Timepoint, scales = "free_x") +
  labs(
    title = "Frequency of Blood Sugar in Target Range Last Week (T0 vs T1) – Cohort 1",
    x = "Response",
    y = "Count"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    strip.text = element_text(size = 14, face = "bold")
  )

### FOLLOW UP APPT
# Raw answers to the "appointment with a healthcare provider" question at
# T0 (since discharge) and T1 (since the last check-in).
appt_T0_raw <- df_COHORT1[["T0- Since your discharge from the hospital, have you had an appointment with a healthcare provider for your diabetes?"]]
appt_T1_raw <- df_COHORT1[["T1- Since our last check in, have you had an appointment with a healthcare provider for your diabetes?"]]
# Standardise yes/no answers.
#
# x: vector of raw responses.
# Returns a character vector of the same length: "Yes" for YES / Y / YS
# (the typo "ys" is accepted), "No" for NO / N, and NA for everything else.
# Matching ignores case and surrounding whitespace.
cleanYN <- function(x) {
  key <- toupper(trimws(x))
  answer <- rep(NA_character_, length(key))
  answer[key %in% c("YES", "Y", "YS")] <- "Yes"
  answer[key %in% c("NO", "N")] <- "No"
  answer
}
# Recode both timepoints to Yes / No (unrecognised answers become NA).
appt_T0_clean <- cleanYN(appt_T0_raw)
appt_T1_clean <- cleanYN(appt_T1_raw)
# Audit the distinct recoded values and their counts, including NAs.
sort(unique(appt_T0_clean))
## [1] "No" "Yes"
sort(unique(appt_T1_clean))
## [1] "No" "Yes"
table(appt_T0_clean, useNA="ifany")
## appt_T0_clean
## No Yes <NA>
## 6 26 1
table(appt_T1_clean, useNA="ifany")
## appt_T1_clean
## No Yes
## 10 23
# Stack T0 and T1, drop rows without a usable answer, and fix the order of
# the response levels.
df_appt_T0 <- data.frame(Timepoint = "T0", Response = appt_T0_clean)
df_appt_T1 <- data.frame(Timepoint = "T1", Response = appt_T1_clean)
df_appt_long <- rbind(df_appt_T0, df_appt_T1)
df_appt_long <- df_appt_long[!is.na(df_appt_long$Response), ]
df_appt_long[["Response"]] <- factor(
  df_appt_long[["Response"]],
  levels = c("No", "Yes")
)
# Dodged bars: number of No / Yes answers at each timepoint.
ggplot(df_appt_long, aes(x = Timepoint, fill = Response)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(
    name = "Appointment Completed?",
    values = c(
      "No" = "#d73027", # red
      "Yes" = "#2E86C1" # blue
    ),
    breaks = c("No", "Yes") # ensures No listed first in legend
  ) +
  labs(
    title = "Appointment with Diabetes Provider (T0 vs T1) – Cohort 1",
    x = "Timepoint",
    y = "Count"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(size = 12),
    plot.title = element_text(size = 14, face = "bold"),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 11)
  )

### MEDICATION ADHERENCE
# Raw answers to "did you miss any doses" at T0 and T1, recoded to Yes / No.
miss_T0_raw <- df_COHORT1[["T0- In the past week, did you miss any doses of you diabetes medications?"]]
miss_T1_raw <- df_COHORT1[["T1- In the past week, did you miss any doses of you diabetes medications?"]]
miss_T0_clean <- cleanYN(miss_T0_raw)
miss_T1_clean <- cleanYN(miss_T1_raw)
table(miss_T0_clean, useNA = "ifany")
## miss_T0_clean
## No Yes
## 26 7
table(miss_T1_clean, useNA = "ifany")
## miss_T1_clean
## No Yes
## 27 6
# Stack both timepoints, drop unusable answers, and fix the level order.
df_miss_T0 <- data.frame(Timepoint = "T0", Response = miss_T0_clean)
df_miss_T1 <- data.frame(Timepoint = "T1", Response = miss_T1_clean)
df_miss_long <- dplyr::filter(
  rbind(df_miss_T0, df_miss_T1),
  !is.na(Response)
)
df_miss_long[["Response"]] <- factor(
  df_miss_long[["Response"]],
  levels = c("No", "Yes")
)
# Dodged bars: number of No / Yes answers at each timepoint.
ggplot(df_miss_long, aes(x = Timepoint, fill = Response)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(
    name = "Missed Doses?",
    values = c(
      "Yes" = "#d73027", # red
      "No" = "#2E86C1" # blue
    ),
    breaks = c("No", "Yes") # ensures legend order
  ) +
  labs(
    title = "Missed Diabetes Medications (T0 vs T1) – Cohort 1",
    x = "Timepoint",
    y = "Count"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(size = 12),
    plot.title = element_text(size = 14, face = "bold"),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 11)
  )

## 1. Locate the food-label confidence columns (T0, T1, T2) by name prefix
##    and pull their raw values -----------------------------
conf_T0_col <- names(df_COHORT1)[
  grepl(
    "^T0- Are you confident in understanding how to read food labels",
    names(df_COHORT1)
  )
]
conf_T1_col <- names(df_COHORT1)[
  grepl(
    "^T1-How confident are you in understanding food labels",
    names(df_COHORT1)
  )
]
conf_T2_col <- names(df_COHORT1)[
  grepl(
    "^T2-How confident are you in understanding food labels",
    names(df_COHORT1)
  )
]
conf_labels_T0_raw <- df_COHORT1[[conf_T0_col]]
conf_labels_T1_raw <- df_COHORT1[[conf_T1_col]]
conf_labels_T2_raw <- df_COHORT1[[conf_T2_col]]
## 2. Clean responses ---------------------------------------
# T0 uses Yes / No / Maybe; match case-insensitively after trimming and turn
# anything else into NA.
conf_labels_T0 <- unname(
  c("YES" = "Yes", "NO" = "No", "MAYBE" = "Maybe")[
    toupper(trimws(conf_labels_T0_raw))
  ]
)
# T1/T2 confidence answers: either a 1-4 code or the equivalent wording.
#
# x: vector of raw responses.
# Returns a character vector of the same length with "Not confident" (1),
# "Somewhat not confident" (2), "Confident" (3, including the misspellings
# COFIDENT / CONFIDEMT) or "Very confident" (4); anything else becomes NA.
# Matching ignores case and surrounding whitespace.
recode_conf_1to4 <- function(x) {
  key <- toupper(trimws(x))
  label_by_key <- c(
    "1" = "Not confident",
    "NOT CONFIDENT" = "Not confident",
    "2" = "Somewhat not confident",
    "SOMEWHAT NOT CONFIDENT" = "Somewhat not confident",
    "3" = "Confident",
    "CONFIDENT" = "Confident",
    "COFIDENT" = "Confident",
    "CONFIDEMT" = "Confident",
    "4" = "Very confident",
    "VERY CONFIDENT" = "Very confident"
  )
  unname(label_by_key[key])
}
conf_labels_T1 <- recode_conf_1to4(conf_labels_T1_raw)
conf_labels_T2 <- recode_conf_1to4(conf_labels_T2_raw)
## 3. Long format: one row per participant and timepoint, dropping rows
##    without a usable answer -------------------------------------------
df_conf_labels_long <- data.frame(
  Timepoint = rep(c("T0", "T1", "T2"), each = nrow(df_COHORT1)),
  Response = c(conf_labels_T0, conf_labels_T1, conf_labels_T2),
  stringsAsFactors = FALSE
)
df_conf_labels_long <- df_conf_labels_long[!is.na(df_conf_labels_long$Response), ]
df_conf_labels_long[["Response"]] <- factor(
  df_conf_labels_long[["Response"]],
  levels = c(
    "Not confident",
    "Somewhat not confident",
    "Maybe",
    "No",
    "Yes",
    "Confident",
    "Very confident"
  )
)
# ----------------------------
# 1. Response scales: T0 is Yes/No/Maybe, T1 is the four-point scale
# ----------------------------
t0_levels <- c("No", "Maybe", "Yes")
t1_levels <- c(
  "Not confident",
  "Somewhat not confident",
  "Confident",
  "Very confident"
)
# ----------------------------
# 2. Build each panel's data separately, keeping only that panel's scale
#    (T2 is not plotted)
# ----------------------------
df_conf_T0 <- df_conf_labels_long |>
  filter(Timepoint == "T0", Response %in% t0_levels) |>
  mutate(Response = factor(Response, levels = t0_levels))
df_conf_T1 <- df_conf_labels_long |>
  filter(Timepoint == "T1", Response %in% t1_levels) |>
  mutate(Response = factor(Response, levels = t1_levels))
df_conf_filtered <- bind_rows(df_conf_T0, df_conf_T1)
# ----------------------------
# 3. Plot: one panel per timepoint, free x scales
# ----------------------------
ggplot(df_conf_filtered, aes(x = Response)) +
  geom_bar(fill = "#2E86C1") +
  facet_wrap(~ Timepoint, nrow = 1, scales = "free_x") +
  labs(
    title = "Confidence Reading Food Labels (T0 vs T1)",
    x = "Response",
    y = "Count of Participants"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    strip.text = element_text(size = 14, face = "bold"),
    plot.title = element_text(size = 14, face = "bold"),
    legend.position = "none"
  )

## Find the T1 "control" question column by pattern, then pull the column
ctrl_col <- names(df_COHORT1)[
  grepl(
    "^T1.?How much did this program help you feel more in control of your diabetes",
    names(df_COHORT1)
  )
]
help_raw <- df_COHORT1[[ctrl_col]]
## Standardize (character, trimmed, upper-case)
help_std <- toupper(trimws(as.character(help_raw)))
## Recode (NEW RULES: 4 = A little, 5 = A lot); unmatched values stay NA
help_clean <- unname(
  c(
    "A LITTLE" = "A little",
    "A LITTLE " = "A little",
    "4" = "A little",
    "SOMEWHAT" = "Somewhat",
    "2" = "Somewhat",
    "A LOT" = "A lot",
    "A LOT " = "A lot",
    "5" = "A lot"
  )[help_std]
)
## Factor with the levels in increasing order of perceived help
help_clean <- factor(
  help_clean,
  levels = c("A little", "Somewhat", "A lot")
)
## Summary table: count and percentage per response
help_tab <- data.frame(table(help_clean), stringsAsFactors = FALSE)
names(help_tab) <- c("Response", "n")
help_tab <- help_tab[!is.na(help_tab$Response), ]
help_tab[["percent"]] <- round(100 * help_tab$n / sum(help_tab$n), 1)
## Bar chart of the percentages with value labels above the bars
ggplot(help_tab, aes(x = Response, y = percent)) +
  geom_col(fill = "#2E86C1") +
  geom_text(
    aes(label = paste0(percent, "%")),
    vjust = -0.3, size = 4
  ) +
  scale_y_continuous(
    # headroom above the tallest bar so its label fits
    limits = c(0, max(help_tab$percent) + 10),
    breaks = seq(0, 100, by = 10),
    labels = function(x) paste0(x, "%")
  ) +
  labs(
    title = "Program impact on perceived diabetes control (T1)",
    x = "",
    y = "% of participants"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 11),
    axis.title = element_text(size = 11)
  )

## ---- Clean Yes/No responses ----
## 1. Locate the T1 nutrition-material column by name prefix
nut_col_T1 <- names(df_COHORT1)[
  grepl(
    "^T1-Did the nutrition material you got from White Plains Hospital",
    names(df_COHORT1)
  )
]
nut_col_T1
## [1] "T1-Did the nutrition material you got from White Plains Hospital help you make healthier food choices?"
# should print the single T1 column name
## 2. Raw values
nut_raw <- df_COHORT1[[nut_col_T1]]
## 3. Standardize, then recode YES/Y -> "Yes" and NO/N -> "No" (else NA)
nut_std <- toupper(trimws(nut_raw))
nut_clean <- rep(NA_character_, length(nut_std))
nut_clean[nut_std %in% c("YES", "Y")] <- "Yes"
nut_clean[nut_std %in% c("NO", "N")] <- "No"
nut_clean <- factor(nut_clean, levels = c("No", "Yes"))
## 4. Count and percentage per response
nut_tab <- data.frame(table(nut_clean), stringsAsFactors = FALSE)
names(nut_tab) <- c("Response", "n")
nut_tab <- nut_tab[!is.na(nut_tab$Response), ]
nut_tab[["percent"]] <- round(100 * nut_tab$n / sum(nut_tab$n), 1)
nut_tab
## Response n percent
## 1 No 3 9.1
## 2 Yes 30 90.9
## ---- Plot: % Yes / No ----
# Bar per response, coloured red (No) / green (Yes), with the percentage
# printed above each bar.
ggplot(nut_tab, aes(x = Response, y = percent, fill = Response)) +
  geom_col() +
  geom_text(
    aes(label = paste0(percent, "%")),
    vjust = -0.3, size = 3.8
  ) +
  scale_fill_manual(
    values = c(
      "No" = "#d73027", # red
      "Yes" = "#27AE60" # green
    )
  ) +
  scale_y_continuous(
    # headroom above the tallest bar so its label fits
    limits = c(0, max(nut_tab$percent) + 10),
    breaks = seq(0, 100, by = 10),
    labels = function(x) paste0(x, "%")
  ) +
  labs(
    title = "Did Nutrition Material Help You Make Healthier Food Choices? (T1)",
    x = "",
    y = "% of participants"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(size = 11),
    axis.title = element_text(size = 11),
    plot.title = element_text(size = 14, face = "bold"),
    legend.position = "none" # optional: remove legend
  )
