df <- readxl::read_excel(“DM.xlsx”) df <- df[-c(62:67), ] # drop rows 62–67 names(df) <- trimws(names(df))# trim header spaces options(dplyr.summarise.inform = FALSE)
install.packages(c( “readxl”, # Excel import “dplyr”, # data manipulation “tidyr”, # reshaping “ggplot2”, # plotting “knitr”, # tables / Rmd “kableExtra”,# nicer tables “janitor” # clean_names(), tabyl() ))
library(readxl) library(dplyr)
# Quick checks
dim(df) # rows and columns
## NULL
names(df) # column names
## NULL
head(df) # first few rows
##
## 1 function (x, df1, df2, ncp, log = FALSE)
## 2 {
## 3 if (missing(ncp))
## 4 .Call(C_df, x, df1, df2, log)
## 5 else .Call(C_dnf, x, df1, df2, ncp, log)
## 6 }
library(readxl) library(ggplot2)
ggplot(df, aes(x = Age)) + geom_histogram(binwidth = 5, fill = “steelblue”, color = “white”) + labs(title = “Age Distribution”, x = “Age (years)”, y = “Count”)
ggplot(df, aes(x = Legal Sex)) + geom_bar(fill =
“darkorange”) + labs(title = “Gender (Legal Sex)”, x = “Legal Sex”, y =
“Count”)
ggplot(df, aes(x = Race)) + geom_bar(fill = “purple”) + labs(title = “Race”, x = “Race”, y = “Count”) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(df, aes(x = Ethnicity)) + geom_bar(fill = “seagreen”) + labs(title = “Ethnicity”, x = “Ethnicity”, y = “Count”) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(dplyr)
ggplot(df, aes(x = HbA1c Value (Last 3 Months))) +
geom_histogram(binwidth = 0.5, fill = “tomato”, color = “white”) +
labs(title = “HbA1c Distribution”, x = “HbA1c (%)”, y = “Count”)
age_table <- df %>% summarise( count = sum(!is.na(Age)), mean_age = mean(Age, na.rm = TRUE), median_age = median(Age, na.rm = TRUE), min_age = min(Age, na.rm = TRUE), max_age = max(Age, na.rm = TRUE) )
gender_table <- df %>% count(Legal Sex, name =
“n”) %>% mutate(percent = round(100 * n / sum(n), 1))
race_table <- df %>% count(Race, name = “n”) %>% mutate(percent = round(100 * n / sum(n), 1))
ethnicity_table <- df %>% count(Ethnicity, name = “n”) %>% mutate(percent = round(100 * n / sum(n), 1))
a1c_table <- df %>% summarise( median_A1c =
median(HbA1c Value (Last 3 Months), na.rm = TRUE), min_A1c
= min(HbA1c Value (Last 3 Months), na.rm = TRUE), max_A1c =
max(HbA1c Value (Last 3 Months), na.rm = TRUE) )
age_table gender_table race_table ethnicity_table a1c_table
names(df) <- trimws(names(df))
find1 <- grep(“worried.run out”, names(df), ignore.case = TRUE) find2 <- grep(”could.afford.not last|couldn’?t\sget more”, names(df), ignore.case = TRUE) find3 <- grep(“last month.*skip meals|skip meals”, names(df), ignore.case = TRUE)
stopifnot(length(find1) >= 1, length(find2) >= 1, length(find3) >= 1)
c(names(df)[find1[1]], names(df)[find2[1]], names(df)[find3[1]]) # print what was matched
names(df)[c(find1[1], find2[1], find3[1])] <- c(“VS1”,“VS2”,“VS3”)
norm <- function(x) trimws(toupper(as.character(x))) yes_matrix <- data.frame( VS1 = norm(df\(VS1) %in% c("YES","Y","SI","SÍ"), VS2 = norm(df\)VS2) %in% c(“YES”,“Y”,“SI”,“SÍ”), VS3 = norm(df$VS3) %in% c(“YES”,“Y”,“SI”,“SÍ”) )
yes_count <- rowSums(yes_matrix, na.rm = TRUE) table_yes <- table(yes_count) percent_yes <- round(100 * prop.table(table_yes), 1)
table_yes percent_yes
yes_patterns <- apply(yes_matrix, 1, function(x) { cols <- names(yes_matrix)[x] if (length(cols) == 0) “None” else paste(cols, collapse = “,”) }) pattern_table <- sort(table(yes_patterns[yes_count > 0]), decreasing = TRUE) pattern_table
if (!requireNamespace(“readxl”, quietly = TRUE)) install.packages(“readxl”) if (!requireNamespace(“dplyr”, quietly = TRUE)) install.packages(“dplyr”) if (!requireNamespace(“tidyr”, quietly = TRUE)) install.packages(“tidyr”) if (!requireNamespace(“ggplot2”, quietly = TRUE)) install.packages(“ggplot2”) if (!requireNamespace(“scales”, quietly = TRUE)) install.packages(“scales”)
library(readxl); library(dplyr); library(tidyr); library(ggplot2); library(scales)
find_col <- function(pattern, nms = names(df)) { hits <- grep(pattern, nms, ignore.case = TRUE, value = TRUE) if (length(hits) == 0) stop(“No column matched:”, pattern) hits[1] }
percent_format_1 <- function(x) scales::percent(x, accuracy = 1)
glucose_cols <- c( find_col(“^T0-\sIn the past week.check.blood sugar”), find_col(”^T1-\sIn the past week.check.blood sugar”), find_col(“^T2-\sIn the past week.check.*blood sugar”) )
glucose_long <- df %>% select(all_of(glucose_cols)) %>% setNames(c(“T0”,“T1”,“T2”)) %>% pivot_longer(everything(), names_to=“Timepoint”, values_to=“Frequency”)
p_glucose <- ggplot(glucose_long, aes(Timepoint, fill = Frequency)) + geom_bar(position = “fill”) + scale_y_continuous(labels = percent_format_1) + labs(title = “Glucose Monitoring Frequency Over Time”, y = “% of participants”, x = NULL, fill = “Frequency”) p_glucose
glycemic_cols <- c( find_col(“^T0-\sWere your sugar levels.target range”), find_col(“^T1-\sWere your sugar levels.target range”), find_col(“^T2-\sWere your sugar levels.target range”) )
glycemic_long <- df %>% select(all_of(glycemic_cols)) %>% setNames(c(“T0”,“T1”,“T2”)) %>% pivot_longer(everything(), names_to=“Timepoint”, values_to=“TargetInRange”)
p_glycemic <- ggplot(glycemic_long, aes(Timepoint, fill = TargetInRange)) + geom_bar(position = “fill”) + scale_y_continuous(labels = percent_format_1) + labs(title = “Self-Reported In-Target Glucose (T0→T2)”, y = “% of participants”, x = NULL, fill = “Response”) p_glycemic
miss_cols <- c( find_col(“^T0-\sIn the past week.miss any doses.diabetes medication”), find_col(”^T1-\sIn the past week.miss any doses.diabetes medication”), find_col(“^T2-\sIn the past week.miss any doses.*diabetes medication”) )
miss_long <- df %>% select(all_of(miss_cols)) %>% setNames(c(“T0”,“T1”,“T2”)) %>% pivot_longer(everything(), names_to=“Timepoint”, values_to=“MissedDose”)
p_miss <- ggplot(miss_long, aes(Timepoint, fill = MissedDose)) + geom_bar(position = “fill”) + scale_y_continuous(labels = percent_format_1) + labs(title = “Missed Diabetes Medication Doses (T0→T2)”, y = “% of participants”, x = NULL, fill = “Response”) p_miss
food_conf_cols <- c( find_col(“^T0-\sAre you.Confident.making food choices.control.blood sugar”), find_col(”^T1-\sAre you.Confident.making food choices.control.blood sugar”), find_col(“^T2-\sAre you.Confident.making food choices.control.*blood sugar”) )
food_conf_long <- df %>% select(all_of(food_conf_cols)) %>% setNames(c(“T0”,“T1”,“T2”)) %>% pivot_longer(everything(), names_to=“Timepoint”, values_to=“Confidence”)
p_food_conf <- ggplot(food_conf_long, aes(Timepoint, fill = Confidence)) + geom_bar(position = “fill”) + scale_y_continuous(labels = percent_format_1) + labs(title = “Confidence in Food Choices (T0→T2)”, y = “% of participants”, x = NULL, fill = “Response”) p_food_conf
label_conf_cols <- c( find_col(“^T0-\sAre you.confident.read(ing)? food labels”), find_col(”^T1-\sAre you.confident.read(ing)? food labels”), find_col(“^T2-\sAre you.confident.*read(ing)? food labels”) )
label_conf_long <- df %>% select(all_of(label_conf_cols)) %>% setNames(c(“T0”,“T1”,“T2”)) %>% pivot_longer(everything(), names_to=“Timepoint”, values_to=“LabelConfidence”)
p_label_conf <- ggplot(label_conf_long, aes(Timepoint, fill = LabelConfidence)) + geom_bar(position = “fill”) + scale_y_continuous(labels = percent_format_1) + labs(title = “Confidence Reading Food Labels (T0→T2)”, y = “% of participants”, x = NULL, fill = “Response”) p_label_conf
library(dplyr) # — Check and select the correct meals columns, then plot — # See all columns that mention “meals” grep(“meals”, names(df), value = TRUE)
meals_cols <- c( “T1- In the past week, how many meals did you make using food from the program?…158”, “T2- In the past week, how many meals did you make using food from the program?…159” )
meals_long <- df %>% dplyr::select(dplyr::all_of(meals_cols)) %>% setNames(c(“T1”,“T2”)) %>% tidyr::pivot_longer(dplyr::everything(), names_to = “Timepoint”, values_to = “MealsFromProgram”)
ggplot(meals_long, aes(x = MealsFromProgram)) + geom_bar() + facet_wrap(~ Timepoint, nrow = 1) + labs(title = “Meals Made Using Program Food (T1 vs T2)”, x = “# meals in past week”, y = “Count”)
idx_meals <- grep( “^T[12]-\sIn the past week.how many meals.*(program|program food)“, names(df), ignore.case = TRUE ) meals_cols <- names(df)[idx_meals] meals_cols
ggplot(meals_long, aes(x = MealsFromProgram)) + geom_bar() + facet_wrap(~ Timepoint, nrow = 1) + labs(title = “Meals Made Using Program Food (T1 vs T2)”, x = “# meals in past week”, y = “Count”)
appt_cols <- c( find_col(“^T1-\sSince our last check.appointment.diabetes”), find_col(”^T2-\sSince our last check.appointment.diabetes”) )
appt_long <- df %>% select(all_of(appt_cols)) %>% setNames(c(“T1”,“T2”)) %>% pivot_longer(everything(), names_to=“Timepoint”, values_to=“HadAppt”)
p_appt <- ggplot(appt_long, aes(Timepoint, fill = HadAppt)) + geom_bar(position = “fill”) + scale_y_continuous(labels = percent_format_1) + labs(title = “Diabetes Appointment Since Last Check-In”, y = “% of participants”, x = NULL, fill = “Response”) p_appt
col_a1c_pre <- “PRE – HbA1c Value (Last 3 Months)” col_a1c_post <- “POST – HbA1c Value (Last 3 Months)”
a1c_long <- df %>% dplyr::transmute(PRE = .data[[col_a1c_pre]], POST = .data[[col_a1c_post]]) %>% tidyr::pivot_longer(dplyr::everything(), names_to = “Time”, values_to = “A1c”)
ggplot(a1c_long, aes(Time, A1c)) + geom_boxplot() + labs(title = “HbA1c: PRE vs POST”, x = NULL, y = “HbA1c (%)”)
a1c_long <- df %>% transmute(PRE = .data[[col_a1c_pre]], POST = .data[[col_a1c_post]]) %>% pivot_longer(everything(), names_to = “Time”, values_to = “A1c”)
p_a1c <- ggplot(a1c_long, aes(Time, A1c)) + geom_boxplot() + labs(title = “HbA1c: PRE vs POST”, x = NULL, y = “HbA1c (%)”) p_a1c
ed_pre <- grep(“^PRE-\sED”, names(df), ignore.case = TRUE, value = TRUE) ed_post <- grep(”^POST-\sED”, names(df), ignore.case = TRUE, value = TRUE) hosp_pre <- grep(“^PRE-\sHosp”, names(df), ignore.case = TRUE, value = TRUE) hosp_post<- grep(”^POST-\sHosp”,names(df), ignore.case = TRUE, value = TRUE)
if (length(ed_pre) & length(ed_post)) { ed_long <- df %>% transmute(PRE = .data[[ed_pre[1]]], POST = .data[[ed_post[1]]]) %>% pivot_longer(everything(), names_to=“Time”, values_to=“ED_Visits”) p_ed <- ggplot(ed_long, aes(Time, ED_Visits)) + stat_summary(fun = mean, geom = “bar”, fill = “grey70”) + stat_summary(fun.data = mean_se, geom = “errorbar”, width = .2) + labs(title = “ED Visits: PRE vs POST (Mean ± SE)”, x = NULL, y = “Visits”) p_ed }
if (length(hosp_pre) & length(hosp_post)) { hosp_long <- df %>% transmute(PRE = .data[[hosp_pre[1]]], POST = .data[[hosp_post[1]]]) %>% pivot_longer(everything(), names_to=“Time”, values_to=“Hosp_Days”) p_hosp <- ggplot(hosp_long, aes(Time, Hosp_Days)) + stat_summary(fun = mean, geom = “bar”, fill = “grey70”) + stat_summary(fun.data = mean_se, geom = “errorbar”, width = .2) + labs(title = “Hospital Days: PRE vs POST (Mean ± SE)”, x = NULL, y = “Days”) p_hosp }
share_cols <- grep(“^T[12]-\sDid you share.”, names(df), ignore.case = TRUE, value = TRUE)
if (length(share_cols)) { # Expect two columns (T1 & T2); keep first two if more share_cols <- share_cols[seq_len(min(2, length(share_cols)))] share_long <- df %>% select(all_of(share_cols)) %>% setNames(sub(“^.(T[12]).”, “\1”, share_cols)) %>% pivot_longer(everything(), names_to=“Timepoint”, values_to=“Shared”) p_share <- ggplot(share_long, aes(Timepoint, fill = Shared)) + geom_bar(position = “fill”) + scale_y_continuous(labels = percent_format_1) + labs(title = “Shared Program Information (T1 vs T2)”, y = “% of participants”, x = NULL, fill = “Response”) p_share }
ls(pattern = “^p_”)
p_age <- ggplot(df, aes(x = Age)) + geom_histogram(binwidth = 5, color = “white”) + labs(title = “Age Distribution”, x = “Age (years)”, y = “Count”)
p_gender <- ggplot(df, aes(x = Legal Sex)) +
geom_bar() + labs(title = “Gender (Legal Sex)”, x = “Legal Sex”, y =
“Count”)
p_race <- ggplot(df, aes(x = Race)) + geom_bar() + labs(title = “Race”, x = “Race”, y = “Count”) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
p_ethnicity <- ggplot(df, aes(x = Ethnicity)) + geom_bar() + labs(title = “Ethnicity”, x = “Ethnicity”, y = “Count”) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
p_age; p_gender; p_race; p_ethnicity
p_age <- ggplot(df, aes(x = Age)) + geom_histogram(binwidth = 5, fill = “steelblue”, color = “white”) + labs(title = “Age Distribution”, x = “Age (years)”, y = “Count”) p_age
p_gender <- ggplot(df, aes(x = Legal Sex)) +
geom_bar(fill = “darkorange”) + labs(title = “Gender Distribution”, x =
“Legal Sex”, y = “Count”) p_gender
p_race <- ggplot(df, aes(x = Race)) + geom_bar(fill = “purple”) + labs(title = “Race Distribution”, x = “Race”, y = “Count”) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) p_race
p_ethnicity <- ggplot(df, aes(x = Ethnicity)) + geom_bar(fill = “seagreen”) + labs(title = “Ethnicity Distribution”, x = “Ethnicity”, y = “Count”) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) p_ethnicity