library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(tidyr)
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
# Set up ----
# Import every column as text; Age and HbA1c are converted to numeric later.
df <- read_excel(path = "DMFIM_DEID_11_25.xlsx", col_types = "text")
names(df)
##  [1] "Pre Enrollment Notes"                                                                                                              
##  [2] "Age"                                                                                                                               
##  [3] "Preferred Language"                                                                                                                
##  [4] "Disch Date/Time"                                                                                                                   
##  [5] "Phone"                                                                                                                             
##  [6] "CC"                                                                                                                                
##  [7] "Admission Diagnosis"                                                                                                               
##  [8] "Discharge Disposition"                                                                                                             
##  [9] "LACE+ READMISSION SCORE Score Column"                                                                                              
## [10] "Unplanned Readmission Score"                                                                                                       
## [11] "Primary Cvg"                                                                                                                       
## [12] "HbA1c Value (Last 3 Months)"                                                                                                       
## [13] "Pt. Portal Status"                                                                                                                 
## [14] "Hosp Last 365 Days"                                                                                                                
## [15] "ED Vis Last 90 Days"                                                                                                               
## [16] "Hosp Last 90 Days"                                                                                                                 
## [17] "PCP"                                                                                                                               
## [18] "Race"                                                                                                                              
## [19] "Ethnicity"                                                                                                                         
## [20] "Legal Sex"                                                                                                                         
## [21] "SDOH High Risk Domains"                                                                                                            
## [22] "Zip code"                                                                                                                          
## [23] "Prescreen Contacted Y/N"                                                                                                           
## [24] "PS \"In the past 12 months, were you ever worried your food would run out before you had money to buy more?\""                     
## [25] "PS \"In the past 12 months, did the food you could afford not last until the end of the month, and you couldn’t get more?\""       
## [26] "PS \"In the last month, did anyone in your household have to skip meals due to lack of food?\""                                    
## [27] "(Pre Screen) Interested in Program?"                                                                                               
## [28] "T0 Contact"                                                                                                                        
## [29] "(Screen for Enrollment) Interested in Program"                                                                                     
## [30] "T0- Are you likely to eat the food as part of your normal daily routine?"                                                          
## [31] "T0- In the past week, how often did you check your blood sugar?"                                                                   
## [32] "T0- Are you Confident in making food choices that help control your blood sugar?"                                                  
## [33] "T0- Are you confident in understanding how to read food labels and nutrition information?"                                         
## [34] "T0- Were your sugar levels usually in your target range last week?"                                                                
## [35] "T0- In the past week, did you miss any doses of you diabetes medications?"                                                         
## [36] "T0- Since your discharge from the hospital, have you had an appointment with a healthcare provider for your diabetes?"             
## [37] "T1 Contact (Y/N)"                                                                                                                  
## [38] "T1 NOTES"                                                                                                                          
## [39] "T1- In the past week, how many meals did you make using food from the program?"                                                    
## [40] "T1-Did you throw away any of the food from your most recent delivery?"                                                             
## [41] "T1- In the past week, how easy was it to use the food in your meals?"                                                              
## [42] "T1- In the past week, how often did you check your blood sugar?"                                                                   
## [43] "T1- Were your sugar levels usually in your target range last week?"                                                                
## [44] "T1-Did you have any symptoms of high or low blood sugar in the past week?"                                                         
## [45] "T1- In the past week, did you miss any doses of you diabetes medications?"                                                         
## [46] "T1- Since our last check in, have you had an appointment with a healthcare provider for your diabetes?"                            
## [47] "T1-Have you done anything in the past week to help manage your diabetes — like walking, portion control, or drinking more water?  "
## [48] "T1-How much did this program help you feel more in control of your diabetes?  "                                                    
## [49] "T1-Did the nutrition material you got from White Plains Hospital help you make healthier food choices?"                            
## [50] "T1- How Confident are you in making food choices that help control your blood sugar?"                                              
## [51] "T1-How confident are you in understanding food labels or nutrition information?  "                                                 
## [52] "T1-How confident are you in your ability to prepare healthy meals with the food you typically have at home?  "                     
## [53] "T1-Do you have any suggestions or feedback to improve this program for others?  "                                                  
## [54] "T2 Contact (Y/N)"                                                                                                                  
## [55] "T2 NOTES"                                                                                                                          
## [56] "T2- In the past week, how many meals did you make using food from the program?"                                                    
## [57] "T2-Did you throw away any of the food from your most recent delivery?"                                                             
## [58] "T2- In the past week, how easy was it to use the food in your meals?"                                                              
## [59] "T2- In the past week, how often did you check your blood sugar?"                                                                   
## [60] "T2- Were your sugar levels usually in your target range last week?"                                                                
## [61] "T2-In the past week, did you have any symptoms of high or low blood sugar?"                                                        
## [62] "T2- In the past week, did you miss any doses of you diabetes medications?"                                                         
## [63] "T2- Since our last check in, have you had an appointment with a healthcare provider for your diabetes?"                            
## [64] "T2-Have you done anything in the past week to help manage your diabetes — like walking, portion control, or drinking more water?  "
## [65] "T2-How much did this program help you feel more in control of your diabetes?  "                                                    
## [66] "T2-Did the nutrition material you got from White Plains Hospital help you make healthier food choices?"                            
## [67] "T2- How Confident are you in making food choices that help control your blood sugar?"                                              
## [68] "T2-How confident are you in understanding food labels or nutrition information?  "                                                 
## [69] "T2-How confident are you in your ability to prepare healthy meals with the food you typically have at home?  "                     
## [70] "T2-Do you have any suggestions or feedback to improve this program for others?  "                                                  
## [71] "Enrolled"                                                                                                                          
## [72] "Cohort"                                                                                                                            
## [73] "Delivery 1"                                                                                                                        
## [74] "D1- Comments"                                                                                                                      
## [75] "Delivery 2"                                                                                                                        
## [76] "02- Comments"                                                                                                                      
## [77] "Delivery 3"                                                                                                                        
## [78] "03- Comments"                                                                                                                      
## [79] "Delivery 4"                                                                                                                        
## [80] "Hemoglobin A1c (8/1/25-10/2/2025)"                                                                                                 
## [81] "Hospital Utilization (8/1/25-10/2/25)"                                                                                             
## [82] "A1c Change"                                                                                                                        
## [83] "A1c Under 8?"                                                                                                                      
## [84] "Hemoglobin A1c (T1)"                                                                                                               
## [85] "Hospital Utilization (T1)"                                                                                                         
## [86] "A1c Change (T1)"                                                                                                                   
## [87] "A1c Under 8? (T1)"                                                                                                                 
## [88] "additional feedback"
# Survey answer columns: every name that begins with "T0-", "T1-" or "T2-".
survey_cols <- grep("^T[012]-", names(df), value = TRUE)

# Placeholder strings that are treated as "no answer".
na_codes <- c("", "NA", "N/A", "na", "n/a", "Na", "NULL", "null")

# Trim each survey answer and turn the placeholder strings into real NA values.
df <- mutate(
  df,
  across(
    all_of(survey_cols),
    \(answer) {
      trimmed <- trimws(as.character(answer))
      replace(trimmed, trimmed %in% na_codes, NA_character_)
    }
  )
)

# T0 question columns: names starting with "T0-", minus any Contact / NOTES fields
t0_q_cols <- names(df)[startsWith(names(df), "T0-")]
t0_q_cols <- grep("Contact|NOTES", t0_q_cols,
                  ignore.case = TRUE, value = TRUE, invert = TRUE)

# T1 question columns: names starting with "T1-", minus any Contact / NOTES fields
t1_q_cols <- names(df)[startsWith(names(df), "T1-")]
t1_q_cols <- grep("Contact|NOTES", t1_q_cols,
                  ignore.case = TRUE, value = TRUE, invert = TRUE)
### Creating Cohort 1 -- all patients with both T0 and T1 data
# Keep only rows that have an answer to every T0 and every T1 question, then
# convert Age from text to numeric (blank and "NA" strings become NA first).
df_COHORT1 <- df |>
  filter(
    if_all(all_of(c(t0_q_cols, t1_q_cols)), \(answer) !is.na(answer))
  ) |>
  mutate(
    Age = as.numeric(
      na_if(na_if(trimws(as.character(Age)), ""), "NA")
    )
  )
### COHORT1 Demographics -- age
# Observed age range (true minimum and maximum)
min(df_COHORT1$Age, na.rm = TRUE)
## [1] 23
max(df_COHORT1$Age, na.rm = TRUE)
## [1] 91
# Boxplot statistics: lower whisker, lower hinge, median, upper hinge,
# upper whisker. The whisker ends exclude values flagged as outliers, so they
# are NOT the minimum / maximum (27 and 79 here versus 23 and 91 above).
age_bp <- boxplot.stats(df_COHORT1$Age)
age_bp$stats
## [1] 27 50 60 66 79
age_stats <- data.frame(
  stat = c("Lower whisker", "Q1", "Median", "Q3", "Upper whisker"),
  value = round(age_bp$stats, 1)
)
age_stats
##            stat value
## 1 Lower whisker    27
## 2            Q1    50
## 3        Median    60
## 4            Q3    66
## 5 Upper whisker    79
# Age histogram (5-year bins) with rug marks and mean / median reference lines.
ggplot(df_COHORT1, aes(x = Age)) +
  geom_histogram(
    binwidth = 5,
    boundary = 0,
    fill = "#5DADE2",
    color = "white",
    alpha = 0.9
  ) +

  # Rug marks
  geom_rug(
    sides = "b",
    alpha = 0.7,
    color = "#1F618D",
    length = unit(0.05, "npc")
  ) +

  # Mean + median: constant intercepts draw a single line each, and line
  # thickness uses `linewidth` (`size` for lines is deprecated since
  # ggplot2 3.4.0).
  geom_vline(xintercept = mean(df_COHORT1$Age, na.rm = TRUE),
             color = "#C0392B", linewidth = 1.1, linetype = "dashed") +
  geom_vline(xintercept = median(df_COHORT1$Age, na.rm = TRUE),
             color = "#27AE60", linewidth = 1.1, linetype = "dashed") +

  # Labels for mean/median
  annotate("text",
           x = mean(df_COHORT1$Age, na.rm = TRUE),
           y = 0.5,
           label = paste0("Mean: ", round(mean(df_COHORT1$Age, na.rm = TRUE), 1)),
           color = "#C0392B",
           hjust = -0.1,
           size = 4) +
  annotate("text",
           x = median(df_COHORT1$Age, na.rm = TRUE),
           y = 1.2,
           label = paste0("Median: ", round(median(df_COHORT1$Age, na.rm = TRUE), 1)),
           color = "#27AE60",
           hjust = 1.1,
           size = 4) +

  # X-axis ticks every 10 years
  scale_x_continuous(
    breaks = seq(floor(min(df_COHORT1$Age, na.rm = TRUE) / 10) * 10,
                 ceiling(max(df_COHORT1$Age, na.rm = TRUE) / 10) * 10,
                 by = 10)
  ) +

  labs(
    title = "Age Distribution – Cohort 1",
    subtitle = "Histogram shows the number of participants in each 5-year age range.\nDashed lines show mean (red) and median (green).",
    x = "Age (years)",
    y = "Number of participants"
  ) +

  theme_bw() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 11),
    axis.text = element_text(size = 11),
    axis.title = element_text(size = 12)
  )

### COHORT1 Demographics -- sex
# SEX
# Count participants per legal sex and express each count as a percentage.
sex_summary <- count(df_COHORT1, `Legal Sex`)
sex_summary <- mutate(sex_summary, percent = round(100 * n / sum(n), 1))
sex_summary
## # A tibble: 2 × 3
##   `Legal Sex`     n percent
##   <chr>       <int>   <dbl>
## 1 Female         19    57.6
## 2 Male           14    42.4
# Pie chart: one stacked bar wrapped into polar coordinates, labelled with
# the percentage of each slice.
ggplot(sex_summary, aes(x = "", y = percent, fill = `Legal Sex`)) +
  geom_col(color = "white", width = 1) +
  geom_label(
    mapping = aes(label = paste0(percent, "%")),
    position = position_stack(vjust = 0.5),
    color = "black",
    show.legend = FALSE
  ) +
  coord_polar(theta = "y") +
  theme_void() +
  theme(
    legend.position = "right",
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5)
  ) +
  labs(
    fill = "Legal Sex",
    title = "Legal Sex Distribution – Cohort 1"
  )

# RACE
# One-column data frame holding only the Race field of Cohort 1
race_df <- select(df_COHORT1, Race)

# Split one Race entry into its individual coded categories.
#
# A category starts at each "R<digits>" code and runs up to the next code (or
# the end of the string), e.g. "R3 Black R5 White" -> c("R3 Black", "R5 White").
# Any text before the first code is discarded.
#
# x: a single string (only the first element is used).
# Returns a character vector with one trimmed element per code, or
# NA_character_ when x is empty, missing, or contains no code.
split_race_base <- function(x) {
  # Guard first: gregexpr() yields NA for NA input, which would make the
  # `== -1` test below fail with "missing value where TRUE/FALSE needed".
  if (length(x) == 0L || is.na(x[1])) return(NA_character_)
  x <- as.character(x[1])

  starts <- as.integer(gregexpr("R[0-9]+", x)[[1]])
  if (starts[1] == -1L) return(NA_character_)

  # Each piece ends one character before the next code; the last piece ends
  # at the end of the string.
  ends <- c(starts[-1] - 1L, nchar(x))
  trimws(substring(x, starts, ends))
}
# Split every Race entry into its individual codes: one row per code.
race_expanded <- race_df |>
  mutate(Race = lapply(Race, split_race_base)) |>
  unnest(cols = Race)

# Tally each race code.
# NOTE(review): a participant listing several codes contributes several rows,
# so `percent` is a share of race selections (sum(n)), not of participants --
# confirm this is the intended denominator for "Percentage of Sample".
race_summary <- race_expanded |>
  filter(!is.na(Race)) |>
  count(Race) |>
  mutate(percent = round(100 * n / sum(n), 1))
race_summary
## # A tibble: 5 × 3
##   Race                                    n percent
##   <chr>                               <int>   <dbl>
## 1 R1 American Indian or Alaska Native     1     2.7
## 2 R3 Black or African-American           14    37.8
## 3 R4 Native Hawaiian                      1     2.7
## 4 R5 White                                4    10.8
## 5 R9 Other                               17    45.9
# Horizontal bars ordered by percentage; the y range is padded by 5 so the
# percentage labels placed past the bar ends stay inside the panel.
ggplot(race_summary, aes(x = reorder(Race, percent), y = percent)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = paste0(percent, "%")),
            size = 4, hjust = -0.1) +
  coord_flip() +
  scale_y_continuous(limits = c(0, max(race_summary$percent) + 5)) +
  theme_bw() +
  theme(
    axis.text.y = element_text(size = 10),
    plot.title = element_text(size = 14, face = "bold")
  ) +
  labs(
    title = "Race Distribution – Cohort 1",
    x = "Race Category",
    y = "Percentage of Sample (%)"
  )

### ETHNICITY – Cohort 1
# Clean and summarize without splitting into 'E' fragments

# Raw column
eth_raw <- df_COHORT1$Ethnicity

# Strip bracketed indices such as [1], [2], then surrounding whitespace
eth_clean <- trimws(gsub("\\[[^]]*\\]", "", eth_raw))

# Discard missing and blank entries
eth_clean <- eth_clean[!(is.na(eth_clean) | eth_clean == "")]

# Map codes to readable labels. E1 is assigned last so that it wins if an
# entry happens to contain both codes; entries with neither code stay NA.
eth_label <- rep(NA_character_, length(eth_clean))
eth_label[grepl("E2", eth_clean)] <- "Not Spanish/Hispanic/Latino"
eth_label[grepl("E1", eth_clean)] <- "Spanish/Hispanic/Latino"

# Drop entries that matched neither code
eth_label <- eth_label[!is.na(eth_label)]

# Counts and percentages per label
tab_eth <- table(eth_label)

eth_summary <- data.frame(Ethnicity = names(tab_eth), n = as.vector(tab_eth))
eth_summary <- transform(eth_summary, percent = round(100 * n / sum(n), 1))
eth_summary
##                     Ethnicity  n percent
## 1 Not Spanish/Hispanic/Latino 17    51.5
## 2     Spanish/Hispanic/Latino 16    48.5
ggplot(eth_summary, aes(x = reorder(Ethnicity, percent), y = percent)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Ethnicity Distribution – Cohort 1",
    x     = "Ethnicity",
    y     = "Percentage of sample (%)"
  ) +
  theme_bw() +
  theme(
    plot.title  = element_text(size = 14, face = "bold"),
    axis.text.y = element_text(size = 10)
  )

# Audit: the distinct cleaned values that fed the label mapping
sort(unique(eth_clean))
## [1] "E1 Spanish/Hispanic/Latino"     "E2 Not Spanish/Hispanic/Latino"
# A1C
# Numeric copy of the `HbA1c Value (Last 3 Months)` column (it was read as text)
df_COHORT1$HbA1c <- as.numeric(df_COHORT1[["HbA1c Value (Last 3 Months)"]])

# Non-missing count, mean, SD and range of HbA1c
hba1c_summary <- summarise(
  df_COHORT1,
  n    = sum(!is.na(HbA1c)),
  mean = mean(HbA1c, na.rm = TRUE),
  sd   = sd(HbA1c, na.rm = TRUE),
  min  = min(HbA1c, na.rm = TRUE),
  max  = max(HbA1c, na.rm = TRUE)
)
hba1c_summary
## # A tibble: 1 × 5
##       n  mean    sd   min   max
##   <int> <dbl> <dbl> <dbl> <dbl>
## 1    33  10.7  1.73     8  14.7
# Compute boxplot stats manually
bp <- boxplot.stats(df_COHORT1$HbA1c)

# boxplot.stats()$stats holds the whisker ends, hinges and median. The whisker
# ends exclude values flagged as outliers (which this plot hides), so they are
# labelled as whiskers rather than "Min" / "Max".
stats_df <- data.frame(
  stat = c("Lower whisker", "Q1", "Median", "Q3", "Upper whisker"),
  value = round(bp$stats, 2)
)

# NOTE(review): geom_boxplot() computes its own box statistics; confirm the
# red markers line up with the drawn box and whiskers for this data.
ggplot(df_COHORT1, aes(x = "", y = HbA1c)) +
  geom_boxplot(fill = "#AED6F1", width = 0.3, outlier.shape = NA) +

  # Add red points at each statistic
  geom_point(data = stats_df,
             aes(x = "", y = value),
             color = "red", size = 3) +

  # Label each point with the statistic name and its HbA1c value
  geom_text(data = stats_df,
            aes(x = "", y = value, label = paste0(stat, ": ", value)),
            nudge_x = 0.25, size = 3.5, hjust = 0) +

  theme_bw() +
  labs(
    title = "HbA1c Boxplot with Statistical Markers — Cohort 1",
    x = "",
    y = "HbA1c (%)"
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.title = element_text(size = 16, face = "bold")
  ) +
  coord_cartesian(clip = "off")   # Allow labels to extend outside plot area

### Blood sugar check
## 1. Locate the T0 and T1 "how often did you check your blood sugar" columns
##    by the start of their names, then pull the raw answers.
t0_col <- names(df_COHORT1)[
  startsWith(names(df_COHORT1),
             "T0- In the past week, how often did you check your blood sugar")
]
t1_col <- names(df_COHORT1)[
  startsWith(names(df_COHORT1),
             "T1- In the past week, how often did you check your blood sugar")
]

t0_raw <- df_COHORT1[[t0_col]]
t1_raw <- df_COHORT1[[t1_col]]

## 2. Cleaning function: only collapses clearly-equivalent strings
#
# x: character vector of free-text monitoring-frequency answers.
# Returns a character vector of the same length holding one of "1–2 times",
# "3–4 times", "Daily", "More than once daily", "Not at all", or
# "Other (check)" for anything unrecognised (including NA).
clean_freq <- function(x) {
  # Normalise before matching so spelling variants need not be enumerated:
  # upper-case, trim, collapse runs of whitespace, and turn en/em dashes
  # (with optional surrounding spaces) into a plain hyphen.
  key <- toupper(trimws(x))
  key <- gsub("\\s+", " ", key)
  key <- gsub("\\s*[\u2013\u2014-]\\s*", "-", key)

  # Normalised answer -> reporting category
  lookup <- c(
    "1-2 TIMES"            = "1–2 times",
    "1 TO 2 TIMES"         = "1–2 times",
    "3-4 TIMES"            = "3–4 times",
    "3 TO 4 TIMES"         = "3–4 times",
    "DAILY"                = "Daily",
    "MORE THAN DAILY"      = "More than once daily",
    "MORE THAN ONCE DAILY" = "More than once daily",
    "MORE THAN ONCE A DAY" = "More than once daily",
    "NOT AT ALL"           = "Not at all"
  )

  out <- unname(lookup[key])
  # Anything unexpected gets labeled as "Other (check)"
  out[is.na(out)] <- "Other (check)"
  out
}

## 3. Long data frame (T0 rows first, then T1) with BOTH raw and cleaned answers
df_bs <- data.frame(
  Timepoint    = rep(c("T0", "T1"), times = c(length(t0_raw), length(t1_raw))),
  Response_raw = c(t0_raw, t1_raw)
)
df_bs$Response_clean <- clean_freq(df_bs$Response_raw)

## 4. Quick audit: cross-tabulate raw against cleaned answers to verify the mapping
table(df_bs$Response_raw, df_bs$Response_clean)
##                        
##                         1–2 times 3–4 times Daily More than once daily
##   1 to 2 TIMES                  1         0     0                    0
##   1-2 times                     2         0     0                    0
##   1-2 TIMES                     4         0     0                    0
##   3-4 times                     0         4     0                    0
##   3-4 TIMES                     0         1     0                    0
##   daily                         0         0     5                    0
##   Daily                         0         0     9                    0
##   DAILY                         0         0     7                    0
##   MORE THAN DAILY               0         0     0                    3
##   MORE THAN ONCE  DAILY         0         0     0                    3
##   MORE THAN ONCE A DAY          0         0     0                    5
##   More than once daily          0         0     0                   18
##   Not at all                    0         0     0                    0
##   NOT AT ALL                    0         0     0                    0
##                        
##                         Not at all
##   1 to 2 TIMES                   0
##   1-2 times                      0
##   1-2 TIMES                      0
##   3-4 times                      0
##   3-4 TIMES                      0
##   daily                          0
##   Daily                          0
##   DAILY                          0
##   MORE THAN DAILY                0
##   MORE THAN ONCE  DAILY          0
##   MORE THAN ONCE A DAY           0
##   More than once daily           0
##   Not at all                     2
##   NOT AT ALL                     2
## 5. Plot the cleaned categories, ordered from least to most frequent.
##    "Other (check)" is not a level (none left in the audit above), so any
##    such value would become NA here.
df_bs$Response_clean <- factor(
  df_bs$Response_clean,
  levels = c(
    "Not at all",
    "1–2 times",
    "3–4 times",
    "Daily",
    "More than once daily"
  )
)

# Side-by-side bars of answer counts per timepoint
ggplot(df_bs, aes(x = Response_clean, fill = Timepoint)) +
  geom_bar(position = "dodge") +
  labs(
    title = "Blood Sugar Monitoring Frequency (T0 vs T1)",
    x = "Monitoring frequency (grouped)",
    y = "Count"
  ) +
  theme_bw() +
  theme(
    plot.title  = element_text(size = 15, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 9)
  )

##### Blood sugar in target range at T0 vs T1
sugar_T0_raw <- df_COHORT1[["T0- Were your sugar levels usually in your target range last week?"]]
sugar_T1_raw <- df_COHORT1[["T1- Were your sugar levels usually in your target range last week?"]]

sugar_T0_std <- toupper(trimws(sugar_T0_raw))

# T0 answers are Yes / No / Maybe; anything else (including NA) becomes NA.
sugar_T0_clean <- unname(
  c("YES" = "Yes", "NO" = "No", "MAYBE" = "Maybe")[sugar_T0_std]
)

table(sugar_T0_clean, useNA = "ifany")
## sugar_T0_clean
## Maybe    No   Yes 
##    11    11    11
# Upper-case and trim the raw T1 answers once
sugar_T1_std <- toupper(trimws(as.character(sugar_T1_raw)))

# Map each standardized answer onto the five-point frequency scale.
# Answers not listed here (including NA) become NA.
sugar_T1_clean <- unname(
  c(
    "NEVER"                   = "Never",
    "N"                       = "Never",
    "SOMETIMES"               = "Sometimes",
    "ABOUT HALF THE TIME"     = "About half the time",
    "HALF THE TIME"           = "About half the time",
    "MORE THAN HALF THE TIME" = "Most of the time",
    "MOST OF THE TIME"        = "Most of the time",
    "MOST OF TIME"            = "Most of the time",
    "YES"                     = "Most of the time",
    "Y"                       = "Most of the time",
    "ALWAYS"                  = "Always"
  )[sugar_T1_std]
)

table_clean <- table(sugar_T1_clean, useNA = "ifany")
table_clean
## sugar_T1_clean
## About half the time              Always    Most of the time               Never 
##                   2                   2                  17                   3 
##           Sometimes 
##                   9
table(sugar_T1_clean, useNA = "ifany")
## sugar_T1_clean
## About half the time              Always    Most of the time               Never 
##                   2                   2                  17                   3 
##           Sometimes 
##                   9
# One data frame per timepoint. The answer scales differ (T0: No/Maybe/Yes,
# T1: five-point frequency), so each gets its own ordered factor levels.
df_sugar_T0 <- data.frame(
  Timepoint = "T0",
  Response  = factor(sugar_T0_clean, levels = c("No", "Maybe", "Yes"))
)

df_sugar_T1 <- data.frame(
  Timepoint = "T1",
  Response  = factor(
    sugar_T1_clean,
    levels = c(
      "Never",
      "Sometimes",
      "About half the time",
      "Most of the time",
      "Always"
    )
  )
)

# Stack the two timepoints and count answers within each
df_sugar_long <- rbind(df_sugar_T0, df_sugar_T1)

df_counts <- count(df_sugar_long, Timepoint, Response)

# One panel per timepoint; free x scales so each panel shows only its own
# answer categories.
ggplot(df_counts, aes(x = Response, y = n)) +
  geom_col(fill = "#2E86C1") +
  geom_text(aes(label = n), size = 4, vjust = -0.3) +
  facet_wrap(~ Timepoint, scales = "free_x") +
  labs(
    title = "Frequency of Blood Sugar in Target Range Last Week (T0 vs T1) – Cohort 1",
    x     = "Response",
    y     = "Count"
  ) +
  theme_bw() +
  theme(
    strip.text  = element_text(size = 14, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10)
  )

### FOLLOW UP APPT
# Raw answers to the "appointment with a healthcare provider" question at T0 and T1
appt_T0_raw <- df_COHORT1[["T0- Since your discharge from the hospital, have you had an appointment with a healthcare provider for your diabetes?"]]
appt_T1_raw <- df_COHORT1[["T1- Since our last check in, have you had an appointment with a healthcare provider for your diabetes?"]]

# Standardise free-text yes/no answers.
#
# x: character vector of raw answers.
# Returns a character vector of the same length: "Yes" for YES / Y / YS
# (the typo "ys" included), "No" for NO / N, and NA for everything else.
# Matching ignores case and surrounding whitespace.
cleanYN <- function(x) {
  key <- toupper(trimws(x))

  answer <- rep(NA_character_, length(key))
  answer[key %in% c("NO", "N")] <- "No"
  answer[key %in% c("YES", "Y", "YS")] <- "Yes"
  answer
}

# Standardise the appointment answers to Yes / No (unrecognised -> NA)
appt_T0_clean <- cleanYN(appt_T0_raw)
appt_T1_clean <- cleanYN(appt_T1_raw)

# Audit the cleaned values and their counts
sort(unique(appt_T0_clean))
## [1] "No"  "Yes"
sort(unique(appt_T1_clean))
## [1] "No"  "Yes"
table(appt_T0_clean, useNA="ifany")
## appt_T0_clean
##   No  Yes <NA> 
##    6   26    1
table(appt_T1_clean, useNA="ifany")
## appt_T1_clean
##  No Yes 
##  10  23
df_appt_T0 <- data.frame(Timepoint = "T0", Response = appt_T0_clean)
df_appt_T1 <- data.frame(Timepoint = "T1", Response = appt_T1_clean)

# Stack both timepoints and drop answers that could not be classified
# (one T0 answer, per the table above).
df_appt_long <- subset(rbind(df_appt_T0, df_appt_T1), !is.na(Response))

df_appt_long$Response <- factor(df_appt_long$Response, levels = c("No", "Yes"))

# Side-by-side No / Yes counts per timepoint
ggplot(df_appt_long, aes(x = Timepoint, fill = Response)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(
    name   = "Appointment Completed?",
    breaks = c("No", "Yes"),           # No is listed first in the legend
    values = c("No" = "#d73027",       # red
               "Yes" = "#2E86C1")      # blue
  ) +
  labs(
    title = "Appointment with Diabetes Provider (T0 vs T1) – Cohort 1",
    x     = "Timepoint",
    y     = "Count"
  ) +
  theme_bw() +
  theme(
    plot.title   = element_text(size = 14, face = "bold"),
    axis.text.x  = element_text(size = 12),
    legend.title = element_text(size = 12),
    legend.text  = element_text(size = 11)
  )

### MEDICATION ADHERENCE
# Raw answers to "did you miss any doses" at T0 and T1
miss_T0_raw <- df_COHORT1[["T0- In the past week, did you miss any doses of you diabetes medications?"]]
miss_T1_raw <- df_COHORT1[["T1- In the past week, did you miss any doses of you diabetes medications?"]]

# Standardise to Yes / No (unrecognised -> NA)
miss_T0_clean <- cleanYN(miss_T0_raw)
miss_T1_clean <- cleanYN(miss_T1_raw)

table(miss_T0_clean, useNA = "ifany")
## miss_T0_clean
##  No Yes 
##  26   7
table(miss_T1_clean, useNA = "ifany")
## miss_T1_clean
##  No Yes 
##  27   6
df_miss_T0 <- data.frame(Timepoint = "T0", Response = miss_T0_clean)
df_miss_T1 <- data.frame(Timepoint = "T1", Response = miss_T1_clean)

# Stack both timepoints, keeping only classified answers
df_miss_long <- dplyr::filter(rbind(df_miss_T0, df_miss_T1), !is.na(Response))

df_miss_long$Response <- factor(df_miss_long$Response, levels = c("No", "Yes"))

# Side-by-side No / Yes counts per timepoint ("Yes" = missed doses, in red)
ggplot(df_miss_long, aes(x = Timepoint, fill = Response)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(
    name   = "Missed Doses?",
    breaks = c("No", "Yes"),           # legend order
    values = c("Yes" = "#d73027",      # red
               "No" = "#2E86C1")       # blue
  ) +
  labs(
    title = "Missed Diabetes Medications (T0 vs T1) – Cohort 1",
    x = "Timepoint",
    y = "Count"
  ) +
  theme_bw() +
  theme(
    plot.title   = element_text(size = 14, face = "bold"),
    axis.text.x  = element_text(size = 12),
    legend.title = element_text(size = 12),
    legend.text  = element_text(size = 11)
  )

## 1. Locate the three "confidence with food labels" columns by the start of
##    their names, then pull the raw answers -----------------------------
conf_T0_col <- names(df_COHORT1)[
  startsWith(names(df_COHORT1),
             "T0- Are you confident in understanding how to read food labels")
]
conf_T1_col <- names(df_COHORT1)[
  startsWith(names(df_COHORT1),
             "T1-How confident are you in understanding food labels")
]
conf_T2_col <- names(df_COHORT1)[
  startsWith(names(df_COHORT1),
             "T2-How confident are you in understanding food labels")
]

conf_labels_T0_raw <- df_COHORT1[[conf_T0_col]]
conf_labels_T1_raw <- df_COHORT1[[conf_T1_col]]
conf_labels_T2_raw <- df_COHORT1[[conf_T2_col]]
## 2. Clean responses ---------------------------------------

# T0 was asked as Yes / No / Maybe. Matching ignores case and surrounding
# whitespace; any other value (including missing) becomes NA.
conf_labels_T0 <- local({
  t0_answers <- c(YES = "Yes", NO = "No", MAYBE = "Maybe")
  answer_key <- toupper(trimws(conf_labels_T0_raw))
  unname(t0_answers[match(answer_key, names(t0_answers))])
})

# T1/T2: answers are either a 1–4 code or the equivalent text.
#
# Maps each element of `x` to one of "Not confident", "Somewhat not confident",
# "Confident" or "Very confident". Matching ignores case and leading/trailing
# whitespace, and accepts two known misspellings of "confident". Values that
# match nothing (including NA) come back as NA_character_.
#
# x: vector of raw survey answers.
# Returns a character vector the same length as `x`.
recode_conf_1to4 <- function(x) {
  # Upper-cased raw value -> display label.
  label_for <- c(
    "1"                      = "Not confident",
    "NOT CONFIDENT"          = "Not confident",
    "2"                      = "Somewhat not confident",
    "SOMEWHAT NOT CONFIDENT" = "Somewhat not confident",
    "3"                      = "Confident",
    "CONFIDENT"              = "Confident",
    "COFIDENT"               = "Confident",
    "CONFIDEMT"              = "Confident",
    "4"                      = "Very confident",
    "VERY CONFIDENT"         = "Very confident"
  )
  answer_key <- toupper(trimws(x))
  # match() yields NA for unknown keys, which indexes to NA_character_.
  unname(label_for[match(answer_key, names(label_for))])
}

# Recode the follow-up timepoints onto the four-point confidence labels.
conf_labels_T1 <- recode_conf_1to4(conf_labels_T1_raw)
conf_labels_T2 <- recode_conf_1to4(conf_labels_T2_raw)
## 3. Long format -------------------------------------------

# Stack T0, T1 and T2 into one row per participant per timepoint, drop the
# unanswered rows, then turn Response into a factor whose levels cover both
# the T0 wording (No / Maybe / Yes) and the T1/T2 four-point scale.
df_conf_labels_long <- data.frame(
  Timepoint = rep(c("T0", "T1", "T2"), each = nrow(df_COHORT1)),
  Response  = c(conf_labels_T0, conf_labels_T1, conf_labels_T2),
  stringsAsFactors = FALSE
) |>
  subset(!is.na(Response)) |>
  transform(
    Response = factor(
      Response,
      levels = c(
        "Not confident",
        "Somewhat not confident",
        "Maybe",
        "No",
        "Yes",
        "Confident",
        "Very confident"
      )
    )
  )
# ----------------------------
# 1. Response sets per panel
# ----------------------------
# T0 used a Yes/No/Maybe wording; T1 used the four-point confidence scale.
t0_levels <- c("No", "Maybe", "Yes")
t1_levels <- c("Not confident", "Somewhat not confident",
               "Confident", "Very confident")

# ----------------------------
# 2. One data frame per panel
# ----------------------------
# Keep only the rows and response values that belong to each timepoint, and
# re-level Response so each panel shows its own categories in order.
df_conf_T0 <- df_conf_labels_long |>
  dplyr::filter(Timepoint == "T0" & Response %in% t0_levels) |>
  dplyr::mutate(Response = factor(Response, levels = t0_levels))

df_conf_T1 <- df_conf_labels_long |>
  dplyr::filter(Timepoint == "T1" & Response %in% t1_levels) |>
  dplyr::mutate(Response = factor(Response, levels = t1_levels))

df_conf_filtered <- dplyr::bind_rows(df_conf_T0, df_conf_T1)

# ----------------------------
# 3. Plot
# ----------------------------
# One facet per timepoint; the x axis is free so each facet lists only its
# own response categories.
ggplot(df_conf_filtered, aes(Response)) +
  geom_bar(fill = "#2E86C1") +
  facet_wrap(~ Timepoint, nrow = 1, scales = "free_x") +
  labs(
    title = "Confidence Reading Food Labels (T0 vs T1)",
    x = "Response",
    y = "Count of Participants"
  ) +
  theme_bw() +
  theme(
    plot.title  = element_text(size = 14, face = "bold"),
    strip.text  = element_text(size = 14, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    legend.position = "none"
  )

## Program impact on perceived diabetes control (T1) ----
## Find the T1 "control" question column by pattern
ctrl_col <- grep("^T1.?How much did this program help you feel more in control of your diabetes",
                 names(df_COHORT1), value = TRUE)

## grep() may return zero or several names; the `[[` lookup below needs
## exactly one, so fail here with a readable message.
if (length(ctrl_col) != 1L) {
  stop("Expected exactly one T1 'control of your diabetes' column, found ",
       length(ctrl_col), call. = FALSE)
}

## Pull the column
help_raw <- df_COHORT1[[ctrl_col]]

## Standardize: coerce to text, trim surrounding whitespace, upper-case
help_std <- toupper(trimws(as.character(help_raw)))

## Recode (NEW RULES: 4 = A little, 5 = A lot); anything unmatched stays NA.
## help_std is already trimmed, so no trailing-space variants are needed.
## NOTE(review): "2" maps to "Somewhat" while 4/5 map to the other two
## categories -- confirm against the survey codebook.
help_clean <- rep(NA_character_, length(help_std))

help_clean[help_std %in% c("A LITTLE", "4")] <- "A little"
help_clean[help_std %in% c("SOMEWHAT", "2")] <- "Somewhat"
help_clean[help_std %in% c("A LOT", "5")]    <- "A lot"

## Factor with levels in display order (least to most help)
help_clean <- factor(help_clean,
                     levels = c("A little", "Somewhat", "A lot"))

## Summary table: count and percent per response.
## table() already leaves out NA, so percentages are among answered items.
help_tab <- data.frame(table(help_clean), stringsAsFactors = FALSE)
names(help_tab) <- c("Response", "n")
help_tab$percent <- round(100 * help_tab$n / sum(help_tab$n), 1)


## Bar chart of percentages, each bar labelled with its value
ggplot(help_tab, aes(x = Response, y = percent)) +
  geom_col(fill = "#2E86C1") +
  geom_text(aes(label = paste0(percent, "%")),
            vjust = -0.3, size = 4) +
  scale_y_continuous(
    limits = c(0, max(help_tab$percent) + 10),   # headroom for the labels
    breaks = seq(0, 100, by = 10),
    labels = function(x) paste0(x, "%")
  ) +
  theme_bw() +
  labs(
    title = "Program impact on perceived diabetes control (T1)",
    x     = "",
    y     = "% of participants"
  ) +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.text  = element_text(size = 11),
    axis.title = element_text(size = 11)
  )

## ---- Clean Yes/No responses ----
## 1. Grab the T1 column by pattern
nut_col_T1 <- grep("^T1-Did the nutrition material you got from White Plains Hospital",
                   names(df_COHORT1), value = TRUE)

## The `[[` lookup in step 2 needs exactly one name; stop with a readable
## message if the pattern matched none or several.
if (length(nut_col_T1) != 1L) {
  stop("Expected exactly one T1 nutrition-material column, found ",
       length(nut_col_T1), call. = FALSE)
}

nut_col_T1
## [1] "T1-Did the nutrition material you got from White Plains Hospital help you make healthier food choices?"
# prints the single matched T1 column name

## 2. Pull raw values
nut_raw <- df_COHORT1[[nut_col_T1]]

## 3. Standardize and recode YES/NO/Y/N (case and surrounding whitespace are
##    ignored; anything else becomes NA)
nut_std <- toupper(trimws(nut_raw))

nut_clean <- dplyr::case_when(
  nut_std %in% c("YES", "Y") ~ "Yes",
  nut_std %in% c("NO",  "N") ~ "No",
  TRUE                       ~ NA_character_
)

nut_clean <- factor(nut_clean, levels = c("No", "Yes"))

## 4. Table with percents
## table() already leaves out NA, so percentages are among answered items.
nut_tab <- data.frame(table(nut_clean), stringsAsFactors = FALSE)
names(nut_tab) <- c("Response", "n")
nut_tab$percent <- round(100 * nut_tab$n / sum(nut_tab$n), 1)

nut_tab
##   Response  n percent
## 1       No  3     9.1
## 2      Yes 30    90.9
## ---- Plot: % Yes / No ----
ggplot(nut_tab, aes(x = Response, y = percent, fill = Response)) +
  geom_col() +
  geom_text(aes(label = paste0(percent, "%")),
            vjust = -0.3, size = 3.8) +
  scale_fill_manual(
    values = c(
      "No"  = "#d73027",   # red
      "Yes" = "#27AE60"    # green
    )
  ) +
  scale_y_continuous(
    limits = c(0, max(nut_tab$percent) + 10),   # headroom for the labels
    breaks = seq(0, 100, by = 10),
    labels = function(x) paste0(x, "%")
  ) +
  theme_bw() +
  labs(
    title = "Did Nutrition Material Help You Make Healthier Food Choices? (T1)",
    x     = "",
    y     = "% of participants"
  ) +
  theme(
    axis.text.x = element_text(size = 11),
    axis.title  = element_text(size = 11),
    plot.title  = element_text(size = 14, face = "bold"),
    legend.position = "none"   # fill duplicates the x axis, so hide the legend
  )