Insulin resistance (IR)—a state in which the body requires more insulin to achieve normal glucose regulation is a major predictor of type 2 diabetes. Early detection is essential, yet standard blood test measures such as the oral glucose tolerance test, euglycemic–hyperinsulinemic clamp, or the HOMA-IR require are invasive, costly, and cannot be used for frequent or continuous assessment.
Digital biomarkers—physiological, lifestyle and behavioral data collected via wearables and mobile devices—may potentially offer a scalable, non-invasive alternative for early IR screening. We evaluated whether non-invasive digital biomarkers combining sleep, physical activity, nutrfactors, and anthopopometrics, and nutrition can predict HOMA-IR–based IR in adults with normoglycemia, prediabetes and type-2 diabetes.
Demonstrating intitial feasibility, random forest models, built on lifestyle data and anthropometrics, predicted IR out-of-sample (Study 1: balanced accuracy = 0.71, 95% CI: 0.69–0.73, p = .020; AUC = 0.87, 95% CI: 0.85–0.88, p = .020) and replicated in an independent test cohort (Study 2: N=56; balanced accuracy = 0.71, 95% CI: 0.69–0.73, p = .020; AUC = 0.87, 95% CI: 0.85–0.88, p = .020); with top predictors including [insert features]. Results remained robust across modeling specifications.
We use the Flagship Dataset of Type 2 Diabetes from open source AI-READI Project.
We include adult participants across three clinical sites (UAB, UW, UCSD) ) who are normoglycemic, have prediabetes, or have type 2 diabetes, and who meet the following criteria:
We further split participants by site:
Both cohorts follow the same inclusion criteria described above.
# ========================================
# STEP 1: Load datasets
# ========================================
hba1c <- read.delim("~/Downloads/participants.tsv", header = TRUE, stringsAsFactors = FALSE)
biom <- read.csv("~/Downloads/biomarker_data.csv", header = TRUE, stringsAsFactors = FALSE)
df <- merge(hba1c, biom, by.x = "participant_id", by.y = "person_id")
n_start <- length(unique(df$participant_id))
# ========================================
# STEP 2: Process observation file for fasting
# ========================================
ob <- read.delim("~/Downloads/observation(in).csv", header = TRUE, stringsAsFactors = FALSE)
ob_split <- ob %>%
separate(
col = 1,
into = c("observation_id","person_id","observation_concept_id",
"observation_date","observation_datetime","observation_type_concept_id",
"value_as_number","value_as_string","value_as_concept_id",
"qualifier_concept_id","unit_concept_id","provider_id",
"visit_occurrence_id","visit_detail_id","observation_source_value",
"observation_source_concept_id","unit_source_value","value_source_value",
"value_source_concept_id","some_extra1","some_extra2"),
sep = ",", fill = "right"
)
paate_rows <- ob_split %>%
filter(grepl("paate", observation_source_value, ignore.case = TRUE)) %>%
select(value_as_string, person_id)
paate_rows$value_as_string <- as.numeric(paate_rows$value_as_string)
paate_rows$person_id <- as.integer(paate_rows$person_id)
fasting_rows <- paate_rows %>% filter(value_as_string >= 8)
# ========================================
# STEP 3: Sequential filtering
# ========================================
# Step 1: Non-insulin-dependent
df_step1 <- df %>% filter(!study_group %in% "insulin_dependent")
n_step1 <- length(unique(df_step1$participant_id))
removed_step1 <- n_start - n_step1
# Step 2: Fasting 8+ hours
df_step2 <- df_step1 %>%
left_join(fasting_rows, by = c("participant_id" = "person_id")) %>%
filter(!is.na(value_as_string))
n_step2 <- length(unique(df_step2$participant_id))
removed_step2 <- n_step1 - n_step2
# Step 3: Exclude UCSD site
df_step3 <- df_step2 %>% filter(clinical_site != "UCSD")
n_step3 <- length(unique(df_step3$participant_id))
removed_step3 <- n_step2 - n_step3
# Step 4: Wearable data available
df_step4 <- df_step3 %>% filter(wearable_activity_monitor == TRUE)
n_step4 <- length(unique(df_step4$participant_id))
removed_step4 <- n_step3 - n_step4
# Step 5: Remove participants with missing Glucose OR INSULIN
df_step5 <- df_step4 %>% filter(!is.na(Glucose..mg.dL.) & !is.na(INSULIN..ng.mL.))
n_step5 <- length(unique(df_step5$participant_id))
removed_step5 <- n_step4 - n_step5
# Step 6: Compute HOMA_insulin
df_step5$HOMA_insulin <- ((df_step5$INSULIN..ng.mL. * 24) * df_step5$Glucose..mg.dL.) / 405
# Step 7: Remove HOMA outliers (>3 SD above median)
median_homa <- median(df_step5$HOMA_insulin, na.rm = TRUE)
sd_homa <- sd(df_step5$HOMA_insulin, na.rm = TRUE)
df_final <- df_step5 %>%
filter(HOMA_insulin <= (median_homa + 3 * sd_homa))
n_final <- length(unique(df_final$participant_id))
removed_step6 <- n_step5 - n_final
# ========================================
# UCSD-only filtering (Stepwise, remove other sites explicitly)
# ========================================
df_ucsd <- df # start with full dataset
# Step 1: Non-insulin-dependent
df_ucsd_step1 <- df_ucsd %>% filter(!study_group %in% "insulin_dependent")
n_step1_ucsd <- length(unique(df_ucsd_step1$participant_id))
removed_step1_ucsd <- n_start - n_step1_ucsd # same starting n_start
# Step 2: Fasting 8+ hours
df_ucsd_step2 <- df_ucsd_step1 %>%
left_join(fasting_rows, by = c("participant_id" = "person_id")) %>%
filter(!is.na(value_as_string))
n_step2_ucsd <- length(unique(df_ucsd_step2$participant_id))
removed_step2_ucsd <- n_step1_ucsd - n_step2_ucsd
# Step 3: Keep only UCSD, remove participants from other two sites
df_ucsd_step3 <- df_ucsd_step2 %>% filter(clinical_site == "UCSD")
n_step3_ucsd <- length(unique(df_ucsd_step3$participant_id))
removed_step3_ucsd <- n_step2_ucsd - n_step3_ucsd
# Step 4: Wearable data available
df_ucsd_step4 <- df_ucsd_step3 %>% filter(wearable_activity_monitor == TRUE)
n_step4_ucsd <- length(unique(df_ucsd_step4$participant_id))
removed_step4_ucsd <- n_step3_ucsd - n_step4_ucsd
# Step 5: Remove participants with missing Glucose OR INSULIN
df_ucsd_step5 <- df_ucsd_step4 %>% filter(!is.na(Glucose..mg.dL.) & !is.na(INSULIN..ng.mL.))
n_step5_ucsd <- length(unique(df_ucsd_step5$participant_id))
removed_step5_ucsd <- n_step4_ucsd - n_step5_ucsd
# Step 6: Compute HOMA_insulin
df_ucsd_step5$HOMA_insulin <- ((df_ucsd_step5$INSULIN..ng.mL. * 24) * df_ucsd_step5$Glucose..mg.dL.) / 405
# Step 7: Remove HOMA outliers (>3 SD above median)
median_homa_ucsd <- median(df_ucsd_step5$HOMA_insulin, na.rm = TRUE)
sd_homa_ucsd <- sd(df_ucsd_step5$HOMA_insulin, na.rm = TRUE)
df_ucsd_final <- df_ucsd_step5 %>%
filter(HOMA_insulin <= (median_homa_ucsd + 3 * sd_homa_ucsd))
n_final_ucsd <- length(unique(df_ucsd_final$participant_id))
removed_step6_ucsd <- n_step5_ucsd - n_final_ucsd
# ========================================
# Side-by-side PRISMA (original visuals)
# ========================================
grViz(glue('
digraph prisma_sidebyside {{
rankdir = TB;
graph [layout = dot, nodesep = 0.5, ranksep = 0.75]
node [
shape = box,
style = "rounded,filled",
color = "#4A4A4A",
fontcolor = "black",
fillcolor = "#E8F0FE",
fontsize = 12,
penwidth = 1.2,
width = 3
]
# Study 1: Model Development & Testing
subgraph cluster_a {{
label = "Study 1: Model Development & Testing"
a1 [label="Starting dataset\\n(n = {n_start})"]
b1 [label="Non–insulin-dependent\\n(n = {n_step1})\\n(removed {removed_step1})"]
c1 [label="Fasting 8+ hours\\n(n = {n_step2})\\n(removed {removed_step2})"]
d1 [label="Clinical site != UCSD\\n(n = {n_step3})\\n(removed {removed_step3})"]
e1 [label="Have wearable data\\n(n = {n_step4})\\n(removed {removed_step4})"]
f1 [label="Glucose OR INSULIN not missing\\n(n = {n_step5})\\n(removed {removed_step5})"]
g1 [label="HOMA outliers removed\\n(n = {n_final})\\n(removed {removed_step6})"]
a1 -> b1 -> c1 -> d1 -> e1 -> f1 -> g1
}}
# Study 2: External Validation
subgraph cluster_b {{
label = "Study 2: UCSD-only (External Validation)"
a2 [label="Starting dataset\\n(n = {n_start})"]
b2 [label="Non–insulin-dependent\\n(n = {n_step1_ucsd})\\n(removed {removed_step1_ucsd})"]
c2 [label="Fasting 8+ hours\\n(n = {n_step2_ucsd})\\n(removed {removed_step2_ucsd})"]
d2 [label="Only UCSD site\\n(n = {n_step3_ucsd})\\n(removed {removed_step3_ucsd})"]
e2 [label="Have wearable data\\n(n = {n_step4_ucsd})\\n(removed {removed_step4_ucsd})"]
f2 [label="Glucose OR INSULIN not missing\\n(n = {n_step5_ucsd})\\n(removed {removed_step5_ucsd})"]
g2 [label="HOMA outliers removed\\n(n = {n_final_ucsd})\\n(removed {removed_step6_ucsd})"]
a2 -> b2 -> c2 -> d2 -> e2 -> f2 -> g2
}}
}}
'))In the next data preparation section, we take four steps to construct and refine our insulin-resistance measures. First, we calculate a person-level continuous insulin resistance score using the HOMA-IR (Homeostatic Model Assessment for Insulin Resistance) equation, which provides a widely used physiological estimate of insulin resistance derived from fasting glucose and insulin. Second, we inspect and treat outliers to reduce the influence of implausible or extreme values (e.g., non fasting values). Third, we binarize the continuous HOMA-IR score into “insulin resistant” versus “non-resistant” categories, allowing for categorical comparisons and easier interpretability in later modeling. Finally, we descriptively compare our binary insulin-resistance classification to the distribution of HbA1c, which helps illustrate how the HOMA-IR–based categorization aligns with established clinically relevant glycemic marker.
https://www.mdcalc.com/calc/3120/homa-ir-homeostatic-model-assessment-insulin-resistance
We removed values that fell +3 SD deviations from the median to account for improbable values (e.g. individuals who likely did not fast; HOMA-IS >35).
Aplied to 3 individuals in Study 1 and to 1 individual in Study 2.
# -----------------------------
before <- data.frame(HOMA = c(df_step5$HOMA_insulin)) # replace with real data
before_no_outliers <- before %>%
filter(HOMA >= mean(HOMA, na.rm = TRUE) - 3*sd(HOMA, na.rm = TRUE),
HOMA <= mean(HOMA, na.rm = TRUE) + 3*sd(HOMA, na.rm = TRUE))
# -----------------------------
# PANEL A: Density Before
# -----------------------------
mean_val_A <- mean(before$HOMA, na.rm = TRUE)
median_val_A <- median(before$HOMA, na.rm = TRUE)
sd_val_A <- sd(before$HOMA, na.rm = TRUE)
min_val_A <- min(before$HOMA, na.rm = TRUE)
max_val_A <- max(before$HOMA, na.rm = TRUE)
dens_A <- density(before$HOMA, na.rm = TRUE)
ymax_plot_A <- max(dens_A$y)*1.05
legend_x_A <- min_val_A + 0.3*(max_val_A - min_val_A)
symbol_y_A <- seq(ymax_plot_A - 0.08*ymax_plot_A, ymax_plot_A - 0.22*ymax_plot_A, length.out = 3)
legend_y_top_A <- ymax_plot_A - 0.02*ymax_plot_A
legend_df_A <- data.frame(
x = rep(legend_x_A, 3),
y = symbol_y_A,
label = c("Mean","Median","SD"),
text = c(paste0("Mean: ", round(mean_val_A,2)),
paste0("Median: ", round(median_val_A,2)),
paste0("SD: ", round(sd_val_A,2)))
)
min_max_text_A <- paste0("Min: ", round(min_val_A,2), " | Max: ", round(max_val_A,2))
p_density_A <- ggplot(before, aes(HOMA)) +
geom_density(fill="#A5D8FF", alpha=0.5, color="#1B6CA8", linewidth=1.1) +
geom_vline(aes(xintercept=mean_val_A, color="Mean"), linewidth=1) +
geom_vline(aes(xintercept=median_val_A, color="Median"), linewidth=1) +
geom_vline(aes(xintercept=mean_val_A + sd_val_A, color="SD"), linetype="dotted") +
geom_vline(aes(xintercept=mean_val_A - sd_val_A, color="SD"), linetype="dotted") +
geom_point(data=legend_df_A, aes(x=x, y=y, color=label), size=1.5) +
geom_text(data=legend_df_A, aes(x=x + 0.02*(max(before$HOMA)-min(before$HOMA)),
y=y, label=text),
hjust=0, color="black", size=3) + # slightly smaller text
annotate("text", x=legend_x_A, y=legend_y_top_A, label=min_max_text_A,
hjust=0, color="black", size=3) +
scale_color_manual(values=c("Mean"="#00468B","Median"="darkred","SD"="gray")) +
labs(title="A: Including Outliers", x="HOMA-Insulin", y="Density") +
theme_classic(base_size=10) +
theme(legend.position="none",
plot.title = element_text(face="bold", hjust=0, size=12)) +
ylim(0, ymax_plot_A) +
theme(aspect.ratio=0.9)
# -----------------------------
# PANEL B: Density After
# -----------------------------
mean_val_B <- mean(before_no_outliers$HOMA, na.rm = TRUE)
median_val_B <- median(before_no_outliers$HOMA, na.rm = TRUE)
sd_val_B <- sd(before_no_outliers$HOMA, na.rm = TRUE)
min_val_B <- min(before_no_outliers$HOMA, na.rm = TRUE)
max_val_B <- max(before_no_outliers$HOMA, na.rm = TRUE)
dens_B <- density(before_no_outliers$HOMA, na.rm = TRUE)
ymax_plot_B <- max(dens_B$y)*1.05
legend_x_B <- min_val_B + 0.3*(max_val_B - min_val_B)
symbol_y_B <- seq(ymax_plot_B - 0.08*ymax_plot_B, ymax_plot_B - 0.22*ymax_plot_B, length.out = 3)
legend_y_top_B <- ymax_plot_B - 0.02*ymax_plot_B
legend_df_B <- data.frame(
x=rep(legend_x_B,3),
y=symbol_y_B,
label=c("Mean","Median","SD"),
text=c(paste0("Mean: ", round(mean_val_B,2)),
paste0("Median: ", round(median_val_B,2)),
paste0("SD: ", round(sd_val_B,2)))
)
min_max_text_B <- paste0("Min: ", round(min_val_B,2), " | Max: ", round(max_val_B,2))
p_density_B <- ggplot(before_no_outliers, aes(HOMA)) +
geom_density(fill="#A5D8FF", alpha=0.5, color="#1B6CA8", linewidth=1.1) +
geom_vline(aes(xintercept=mean_val_B, color="Mean"), linewidth=1) +
geom_vline(aes(xintercept=median_val_B, color="Median"), linewidth=1) +
geom_vline(aes(xintercept=mean_val_B + sd_val_B, color="SD"), linetype="dotted") +
geom_vline(aes(xintercept=mean_val_B - sd_val_B, color="SD"), linetype="dotted") +
geom_point(data=legend_df_B, aes(x=x, y=y, color=label), size=1.5) +
geom_text(data=legend_df_B, aes(x=x + 0.02*(max(before_no_outliers$HOMA)-min(before_no_outliers$HOMA)),
y=y, label=text),
hjust=0, color="black", size=3) + # slightly smaller text
annotate("text", x=legend_x_B, y=legend_y_top_B, label=min_max_text_B,
hjust=0, color="black", size=3) +
scale_color_manual(values=c("Mean"="#00468B","Median"="darkred","SD"="gray")) +
labs(title="B: Post Outlier Removal", x="HOMA-Insulin", y="Density") +
theme_classic(base_size=10) +
theme(legend.position="none",
plot.title = element_text(face="bold", hjust=0, size=12)) +
ylim(0, ymax_plot_B) +
theme(aspect.ratio=0.9)
# -----------------------------
# PANEL C: Boxplot Before
# -----------------------------
line_3sd <- median(before$HOMA, na.rm = TRUE) + 3*sd(before$HOMA, na.rm = TRUE)
# Add a color column: dark red if > median + 3 SD, else default
before <- before %>%
mutate(color_flag = ifelse(HOMA > line_3sd, "above_3SD", "normal"))
# Plot
p_A <- ggplot(before, aes(x = "", y = HOMA)) +
geom_boxplot(width = 0.4, fill = "#A5D8FF", alpha = 0.7, outlier.shape = NA) +
geom_jitter(aes(color = color_flag), width = 0.1, size = 2, alpha = 0.6) +
scale_color_manual(values = c("normal" = "#1B6CA8", "above_3SD" = "darkred")) +
geom_hline(yintercept = line_3sd, linetype = "dashed", color = "gray", size = 0.7) +
annotate("text", x = 1.2, y = line_3sd + 2,
label = paste0("Med. + 3SD = ", round(line_3sd,2)),
hjust = 0, color = "gray20", size = 3) +
labs(title = "C: Including Outliers", x = "", y = "HOMA-Insulin") +
theme_classic(base_size = 10) +
theme(plot.title = element_text(face = "bold", hjust = 0, size = 12),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
legend.position = "none") +
theme(aspect.ratio = 0.9)
# -----------------------------
# PANEL D: Boxplot After
# -----------------------------
p_B <- ggplot(before_no_outliers, aes(x="", y=HOMA)) +
geom_boxplot(width=0.4, fill="#A5D8FF", alpha=0.7, outlier.shape=NA) +
geom_jitter(width=0.1, size=2, color="#1B6CA8", alpha=0.4) +
labs(title="D: Post Outlier Removal", x="", y="HOMA-Insulin") +
theme_classic(base_size=10) +
theme(plot.title=element_text(face="bold", hjust=0, size=12),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
theme(aspect.ratio=0.9)
# -----------------------------
# Combine 4 plots in 2x2 grid
# -----------------------------
plot_grid(p_density_A, p_density_B,
p_A, p_B,
ncol=2, align="hv")# -----------------------------
# Prepare data
# -----------------------------
before <- data.frame(HOMA = df_ucsd_step5$HOMA_insulin)
before_no_outliers <- before %>%
filter(HOMA >= mean(HOMA, na.rm = TRUE) - 3*sd(HOMA, na.rm = TRUE),
HOMA <= mean(HOMA, na.rm = TRUE) + 3*sd(HOMA, na.rm = TRUE))
# -----------------------------
# PANEL A: Density Before
# -----------------------------
mean_val_A <- mean(before$HOMA, na.rm = TRUE)
median_val_A <- median(before$HOMA, na.rm = TRUE)
sd_val_A <- sd(before$HOMA, na.rm = TRUE)
min_val_A <- min(before$HOMA, na.rm = TRUE)
max_val_A <- max(before$HOMA, na.rm = TRUE)
dens_A <- density(before$HOMA, na.rm = TRUE)
ymax_plot_A <- max(dens_A$y)*1.05
legend_x_A <- min_val_A + 0.3*(max_val_A - min_val_A)
symbol_y_A <- seq(ymax_plot_A - 0.08*ymax_plot_A, ymax_plot_A - 0.22*ymax_plot_A, length.out = 3)
legend_y_top_A <- ymax_plot_A - 0.02*ymax_plot_A
legend_df_A <- data.frame(
x = rep(legend_x_A, 3),
y = symbol_y_A,
label = c("Mean","Median","SD"),
text = c(paste0("Mean: ", round(mean_val_A,2)),
paste0("Median: ", round(median_val_A,2)),
paste0("SD: ", round(sd_val_A,2)))
)
min_max_text_A <- paste0("Min: ", round(min_val_A,2), " | Max: ", round(max_val_A,2))
p_density_A <- ggplot(before, aes(HOMA)) +
geom_density(fill="#A5D8FF", alpha=0.5, color="#1B6CA8", linewidth=1.1) +
geom_vline(aes(xintercept=mean_val_A, color="Mean"), linewidth=1) +
geom_vline(aes(xintercept=median_val_A, color="Median"), linewidth=1) +
geom_vline(aes(xintercept=mean_val_A + sd_val_A, color="SD"), linetype="dotted") +
geom_vline(aes(xintercept=mean_val_A - sd_val_A, color="SD"), linetype="dotted") +
geom_point(data=legend_df_A, aes(x=x, y=y, color=label), size=1.5) +
geom_text(data=legend_df_A, aes(x=x + 0.02*(max(before$HOMA)-min(before$HOMA)), y=y, label=text),
hjust=0, color="black", size=3) +
annotate("text", x=legend_x_A, y=legend_y_top_A, label=min_max_text_A,
hjust=0, color="black", size=3) +
scale_color_manual(values=c("Mean"="darkred","Median"="#005b96","SD"="#011f4b")) +
labs(title="A: Including Outliers", x="HOMA-Insulin", y="Density") +
theme_classic(base_size=10) +
theme(legend.position="none",
plot.title = element_text(face="bold", hjust=0, size=12)) +
ylim(0, ymax_plot_A) +
theme(aspect.ratio=0.9)
# -----------------------------
# PANEL B: Density After
# -----------------------------
mean_val_B <- mean(before_no_outliers$HOMA, na.rm = TRUE)
median_val_B <- median(before_no_outliers$HOMA, na.rm = TRUE)
sd_val_B <- sd(before_no_outliers$HOMA, na.rm = TRUE)
min_val_B <- min(before_no_outliers$HOMA, na.rm = TRUE)
max_val_B <- max(before_no_outliers$HOMA, na.rm = TRUE)
dens_B <- density(before_no_outliers$HOMA, na.rm = TRUE)
ymax_plot_B <- max(dens_B$y)*1.05
legend_x_B <- min_val_B + 0.3*(max_val_B - min_val_B)
symbol_y_B <- seq(ymax_plot_B - 0.08*ymax_plot_B, ymax_plot_B - 0.22*ymax_plot_B, length.out = 3)
legend_y_top_B <- ymax_plot_B - 0.02*ymax_plot_B
legend_df_B <- data.frame(
x=rep(legend_x_B,3),
y=symbol_y_B,
label=c("Mean","Median","SD"),
text=c(paste0("Mean: ", round(mean_val_B,2)),
paste0("Median: ", round(median_val_B,2)),
paste0("SD: ", round(sd_val_B,2)))
)
min_max_text_B <- paste0("Min: ", round(min_val_B,2), " | Max: ", round(max_val_B,2))
p_density_B <- ggplot(before_no_outliers, aes(HOMA)) +
geom_density(fill="#A5D8FF", alpha=0.5, color="#1B6CA8", linewidth=1.1) +
geom_vline(aes(xintercept=mean_val_B, color="Mean"), linewidth=1) +
geom_vline(aes(xintercept=median_val_B, color="Median"), linewidth=1) +
geom_vline(aes(xintercept=mean_val_B + sd_val_B, color="SD"), linetype="dotted") +
geom_vline(aes(xintercept=mean_val_B - sd_val_B, color="SD"), linetype="dotted") +
geom_point(data=legend_df_B, aes(x=x, y=y, color=label), size=1.5) +
geom_text(data=legend_df_B, aes(x=x + 0.02*(max(before_no_outliers$HOMA)-min(before_no_outliers$HOMA)),
y=y, label=text),
hjust=0, color="black", size=3) +
annotate("text", x=legend_x_B, y=legend_y_top_B, label=min_max_text_B,
hjust=0, color="black", size=3) +
scale_color_manual(values=c("Mean"="darkred","Median"="#005b96","SD"="#011f4b")) +
labs(title="B: Post Outlier Removal", x="HOMA-Insulin", y="Density") +
theme_classic(base_size=10) +
theme(legend.position="none",
plot.title = element_text(face="bold", hjust=0, size=12)) +
ylim(0, ymax_plot_B) +
theme(aspect.ratio=0.9)
# -----------------------------
# PANEL C: Boxplot Before
# -----------------------------
line_3sd <- median(before$HOMA, na.rm = TRUE) + 3*sd(before$HOMA, na.rm = TRUE)
before <- before %>%
mutate(color_flag = ifelse(HOMA > line_3sd, "above_3SD", "normal"))
p_A <- ggplot(before, aes(x = "", y = HOMA)) +
geom_boxplot(width = 0.4, fill = "#A5D8FF", alpha = 0.7, outlier.shape = NA) +
geom_jitter(aes(color = color_flag), width = 0.1, size = 2, alpha = 0.6) +
scale_color_manual(values = c("normal" = "#1B6CA8", "above_3SD" = "darkred")) +
geom_hline(yintercept = line_3sd, linetype = "dashed", color = "gray", size = 0.7) +
annotate("text", x = 1.2, y = line_3sd + 2,
label = paste0("Med. + 3SD = ", round(line_3sd,2)),
hjust = 0, color = "gray20", size = 3) +
labs(title = "C: Including Outliers", x = "", y = "HOMA-Insulin") +
theme_classic(base_size = 10) +
theme(plot.title = element_text(face = "bold", hjust = 0, size = 12),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
legend.position = "none") +
theme(aspect.ratio = 0.9)
# -----------------------------
# PANEL D: Boxplot After
# -----------------------------
p_B <- ggplot(before_no_outliers, aes(x="", y=HOMA)) +
geom_boxplot(width=0.4, fill="#A5D8FF", alpha=0.7, outlier.shape=NA) +
geom_jitter(width=0.1, size=2, color="#1B6CA8", alpha=0.4) +
labs(title="D: Post Outlier Removal", x="", y="HOMA-Insulin") +
theme_classic(base_size=10) +
theme(plot.title=element_text(face="bold", hjust=0, size=12),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
theme(aspect.ratio=0.9)
# -----------------------------
# Combine 4 plots in 2x2 grid
# -----------------------------
# --- Top row: density plots ---
top_row <- cowplot::plot_grid(p_density_A, p_density_B, ncol = 2, labels = c("A", "B"))
# --- Bottom row: HOMA plots ---
bottom_row <- cowplot::plot_grid(p_A, p_B, ncol = 2, labels = c("C", "D"))
# --- Combine top and bottom rows vertically ---
combined_plot <- cowplot::plot_grid(top_row, bottom_row, ncol = 1, align = "v")
combined_plot# ============================================================
# Load libraries
# ============================================================
# ============================================================
# Shared Colors
# ============================================================
colors_homa <- c("Non-IR" = "#1B6CA8", "IR" = "#953553")
colors_hba1c <- c(
"Normoglycemic" = "#1B6CA8",
"Prediabetic" = "#E69F00",
"T2 Diabetic" = "#953553"
)
# ============================================================
# -------------------- Study 1: HOMA-IR -----------------------
# ============================================================
df_f <- df_final %>%
mutate(
IR_status = ifelse(HOMA_insulin >= 2.5, 1, 0),
IR_label = ifelse(IR_status == 1, "IR", "Non-IR")
)
counts_homa <- df_f %>% group_by(IR_label) %>% summarise(n = n())
legend_labels_homa <- paste0(counts_homa$IR_label, ": ", counts_homa$n)
names(legend_labels_homa) <- counts_homa$IR_label
p_homa <- ggplot(df_f, aes(x = "", y = HOMA_insulin, color = IR_label)) +
geom_jitter(width = 0.1, size = 2, alpha = 0.8) +
geom_hline(yintercept = 2.5, linetype = "dashed", color = "gray40", size = 1) +
labs(title = "Study 1: HOMA-IR ≥ 2.5", y = "HOMA-IR", x = "", color = "IR Status") +
theme_classic(base_size = 10) +
theme(plot.title = element_text(face = "bold", hjust = 0.5),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
aspect.ratio = 0.9) +
scale_color_manual(values = colors_homa, labels = legend_labels_homa)
# ============================================================
# -------------------- Study 2: HOMA-IR -----------------------
# ============================================================
df_s2 <- df_ucsd_final %>%
mutate(
IR_status = ifelse(HOMA_insulin >= 2.5, 1, 0),
IR_label = ifelse(IR_status == 1, "IR", "Non-IR")
)
counts_homa_s2 <- df_s2 %>% group_by(IR_label) %>% summarise(n = n())
legend_labels_homa_s2 <- paste0(counts_homa_s2$IR_label, ": ", counts_homa_s2$n)
names(legend_labels_homa_s2) <- counts_homa_s2$IR_label
p_homa_s2 <- ggplot(df_s2, aes(x = "", y = HOMA_insulin, color = IR_label)) +
geom_jitter(width = 0.1, size = 2, alpha = 0.8) +
geom_hline(yintercept = 2.5, linetype = "dashed", color = "gray40", size = 1) +
labs(title = "Study 2: HOMA-IR ≥ 2.5", y = "HOMA-IR", x = "", color = "IR Status") +
theme_classic(base_size = 10) +
theme(plot.title = element_text(face = "bold", hjust = 0.5),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
aspect.ratio = 0.9) +
scale_color_manual(values = colors_homa, labels = legend_labels_homa_s2)
# ============================================================
# -------------------- Study 1: HbA1c -------------------------
# ============================================================
df_final <- df_final %>%
mutate(
HbA1c_value = HbA1c....,
Glycemic_label = case_when(
HbA1c_value < 5.7 ~ "Normoglycemic",
HbA1c_value >= 5.7 & HbA1c_value < 6.5 ~ "Prediabetic",
HbA1c_value >= 6.5 ~ "T2 Diabetic"
)
)
counts_hba1c <- df_final %>% group_by(Glycemic_label) %>% summarise(n = n())
legend_labels_hba1c <- paste0(counts_hba1c$Glycemic_label, ": ", counts_hba1c$n)
names(legend_labels_hba1c) <- counts_hba1c$Glycemic_label
p_hba1c <- ggplot(df_final, aes(x = "", y = HbA1c_value, color = Glycemic_label)) +
geom_jitter(width = 0.12, size = 2, alpha = 0.8) +
geom_hline(yintercept = 5.7, linetype = "dashed", color = "gray40", size = 1) +
geom_hline(yintercept = 6.5, linetype = "dashed", color = "gray40", size = 1) +
labs(title = "Study 1: HbA1c Classification", y = "HbA1c (%)", x = "", color = "Status") +
theme_classic(base_size = 10) +
theme(plot.title = element_text(face = "bold", hjust = 0.5),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
aspect.ratio = 1) +
scale_color_manual(values = colors_hba1c, labels = legend_labels_hba1c)
# ============================================================
# -------------------- Study 2: HbA1c -------------------------
# ============================================================
df_s2 <- df_ucsd_final %>%
mutate(
HbA1c_value = HbA1c....,
Glycemic_label = case_when(
HbA1c_value < 5.7 ~ "Normoglycemic",
HbA1c_value >= 5.7 & HbA1c_value < 6.5 ~ "Prediabetic",
HbA1c_value >= 6.5 ~ "T2 Diabetic"
)
)
counts_hba1c_s2 <- df_s2 %>% group_by(Glycemic_label) %>% summarise(n = n())
legend_labels_hba1c_s2 <- paste0(counts_hba1c_s2$Glycemic_label, ": ", counts_hba1c_s2$n)
names(legend_labels_hba1c_s2) <- counts_hba1c_s2$Glycemic_label
p_hba1c_s2 <- ggplot(df_s2, aes(x = "", y = HbA1c_value, color = Glycemic_label)) +
geom_jitter(width = 0.12, size = 2, alpha = 0.8) +
geom_hline(yintercept = 5.7, linetype = "dashed", color = "gray40", size = 1) +
geom_hline(yintercept = 6.5, linetype = "dashed", color = "gray40", size = 1) +
labs(title = "Study 2: HbA1c Classification", y = "HbA1c (%)", x = "", color = "Status") +
theme_classic(base_size = 10) +
theme(plot.title = element_text(face = "bold", hjust = 0.5),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
aspect.ratio = 1) +
scale_color_manual(values = colors_hba1c, labels = legend_labels_hba1c_s2)
# ============================================================
# -------------------- Combine All Plots ----------------------
# ============================================================
# Row 1: HOMA-IR (Study 1 vs Study 2)
row1 <- p_homa | p_homa_s2
# Row 2: HbA1c (Study 1 vs Study 2)
row2 <- p_hba1c | p_hba1c_s2
# Full figure
full_plot <- row1 / row2
full_plot# ============================================================
# Function to summarize HbA1c per IR group with range
# ============================================================
summarize_hba1c_IR <- function(df, study_name) {
df <- df %>%
mutate(
HbA1c_value = as.numeric(HbA1c....),
IR_label = ifelse(HOMA_insulin >= 2.5, "IR", "Non-IR")
)
na_count <- sum(is.na(df$HbA1c_value))
df_clean <- df %>% filter(!is.na(HbA1c_value))
descriptives <- df_clean %>%
group_by(IR_label) %>%
summarise(
N = n(),
Mean = round(mean(HbA1c_value), 2),
SD = round(sd(HbA1c_value), 2),
Median = round(median(HbA1c_value), 2),
Range = paste0(round(min(HbA1c_value),2), "–", round(max(HbA1c_value),2)),
.groups = "drop"
) %>%
mutate(Study = study_name,
N_Missing = na_count) %>%
select(Study, IR_label, N, N_Missing, Mean, SD, Median, Range)
# T-test (only if 2 groups)
if(length(unique(df_clean$IR_label)) == 2){
t_res <- t.test(HbA1c_value ~ IR_label, data = df_clean)
t_test <- tibble(
Study = study_name,
t = round(t_res$statistic, 2),
df = round(t_res$parameter, 1),
p = signif(t_res$p.value, 3)
)
} else {
t_test <- tibble(
Study = study_name,
t = NA, df = NA, p = NA
)
}
list(descriptives = descriptives, t_test = t_test)
}
# ============================================================
# Summarize studies
# ============================================================
summary_s1 <- summarize_hba1c_IR(df_final, "Study 1")
summary_s2 <- summarize_hba1c_IR(df_ucsd_final, "Study 2")
descriptives_table <- bind_rows(summary_s1$descriptives, summary_s2$descriptives)
t_tests_table <- bind_rows(summary_s1$t_test, summary_s2$t_test)
# ============================================================
# Merge for display, only show t-test once per study
# ============================================================
final_table <- descriptives_table %>%
left_join(t_tests_table, by = "Study") %>%
arrange(Study, IR_label)
# Remove t-test values for second row of each study to only display once
final_table <- final_table %>%
group_by(Study) %>%
mutate(
t = ifelse(row_number() == 1, t, NA),
df = ifelse(row_number() == 1, df, NA),
p = ifelse(row_number() == 1, p, NA)
) %>%
ungroup()
# ============================================================
# Display as HTML kable
# ============================================================
kable(final_table, "html", caption = "HbA1c Summary by IR Status") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = T, font_size = 14) %>%
row_spec(0, bold = TRUE, background = "#F5FBFF") %>%
collapse_rows(columns = 1, valign = "top") %>%
add_header_above(c(" " = 2, "Descriptive Statistics" = 6, "T-Test (Non-IR vs IR)" = 3))| Study | IR_label | N | N_Missing | Mean | SD | Median | Range | t | df | p |
|---|---|---|---|---|---|---|---|---|---|---|
| Study 1 | IR | 61 | 1 | 6.35 | 1.07 | 6.1 | 4.5–10.7 | 4.71 | 86.8 | 9.3e-06 |
| Non-IR | 76 | 1 | 5.63 | 0.57 | 5.5 | 4.6–7.4 | NA | NA | NA | |
| Study 2 | IR | 26 | 0 | 6.22 | 0.67 | 6.1 | 5.2–7.7 | 3.21 | 41.2 | 2.6e-03 |
| Non-IR | 30 | 0 | 5.73 | 0.43 | 5.7 | 4.9–6.6 | NA | NA | NA |
# ============================================================
# 1. Create IR label
# ============================================================
df_final <- df_final %>%
mutate(IR_label = ifelse(HOMA_insulin >= 2.5, "IR", "Non-IR"))
# ============================================================
# 2. Original variables to plot
# ============================================================
vars_to_plot <- c(
"age",
"weight_vsorres..Weight..kilograms.",
"bmi_vsorres..BMI",
"hip_vsorres..Hip.Circumference..cm.",
"waist_vsorres..Waist.Circumference..cm.",
"whr_vsorres..Waist.to.Hip.Ratio..WHR.",
"pulse_vsorres..Heart.Rate..bpm.",
"bp1_diabp_vsorres..Diastolic..mmHg.",
"bp2_diabp_vsorres..Diastolic..mmHg.",
"bp1_sysbp_vsorres..Systolic..mmHg.",
"bp2_sysbp_vsorres..Systolic..mmHg.",
"clinical_site"
)
# Keep only existing columns
vars_to_plot <- vars_to_plot[vars_to_plot %in% names(df_final)]
# ============================================================
# 3. Short names for titles
# ============================================================
short_names <- c(
"bmi_vsorres..BMI" = "BMI",
"bp1_diabp_vsorres..Diastolic..mmHg." = "Diastolic_BP1",
"bp2_diabp_vsorres..Diastolic..mmHg." = "Diastolic_BP2",
"bp1_sysbp_vsorres..Systolic..mmHg." = "Systolic_BP1",
"bp2_sysbp_vsorres..Systolic..mmHg." = "Systolic_BP2",
"pulse_vsorres..Heart.Rate..bpm." = "PulseHR",
"waist_vsorres..Waist.Circumference..cm." = "Waist_Circumference",
"hip_vsorres..Hip.Circumference..cm." = "Hip_Circumference",
"weight_vsorres..Weight..kilograms." = "Weight (kg)",
"whr_vsorres..Waist.to.Hip.Ratio..WHR." = "WaistHip_Ratio",
"age" = "Age",
"clinical_site" = "Clinical_Site"
)
# Only keep short names for variables present
short_names <- short_names[names(short_names) %in% vars_to_plot]
# ============================================================
# 4. Colors
# ============================================================
fill_colors <- c("Non-IR" = "#A5D8FF", "IR" = "#FFB6C1")
edge_colors <- c("Non-IR" = "#1B6CA8", "IR" = "#C71585")
# ============================================================
# 5. Plotting function with p-value on the right
# ============================================================
plot_ir_variable <- function(var_name, data, short_names, fill_colors, edge_colors) {
if (!var_name %in% names(data)) return(NULL)
plot_data <- data %>% filter(!is.na(.data[[var_name]]))
if (nrow(plot_data) == 0) return(NULL)
plot_title <- ifelse(var_name %in% names(short_names), short_names[[var_name]], var_name)
x <- plot_data[[var_name]]
# Compute p-value
p_val <- if (is.numeric(x)) {
tryCatch(t.test(x ~ IR_label, data = plot_data)$p.value, error = function(e) NA)
} else {
tbl <- table(x, plot_data$IR_label)
tryCatch(chisq.test(tbl)$p.value, error = function(e) NA)
}
# Format p-value as text
p_text <- if (!is.na(p_val)) {
if (p_val < 0.001) "p < 0.001" else paste0("p = ", signif(p_val, 3))
} else {
"p = NA"
}
# Numeric plot
if (is.numeric(x)) {
max_y <- max(x, na.rm = TRUE)
min_y <- min(x, na.rm = TRUE)
p <- ggplot(plot_data, aes(x = IR_label, y = .data[[var_name]], fill = IR_label, color = IR_label)) +
geom_violin(alpha = 0.5, size = 0.8) +
geom_jitter(width = 0.1, size = 1.5, alpha = 0.8) +
scale_fill_manual(values = fill_colors) +
scale_color_manual(values = edge_colors) +
labs(title = plot_title, x = "IR Status", y = plot_title) +
theme_classic(base_size = 9) +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 9), axis.text.y = element_text(face = "bold"),
legend.position = "none",
aspect.ratio = 1.3)
} else { # Categorical plot
p <- ggplot(plot_data, aes(x = .data[[var_name]], fill = IR_label)) +
geom_bar(position = "dodge", color = "black") +
scale_fill_manual(values = fill_colors) +
labs(title = plot_title, x = plot_title, y = "Count") +
theme_classic(base_size = 9) +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 9),
axis.text.x = element_text(angle = 45, hjust = 1),
axis.text.y = element_text(face = "bold"),
legend.position = "none",
aspect.ratio = 1.3)
}
# Add p-value text annotation to the right
# p <- p + annotate("text", x = Inf, y = Inf, label = p_text,
# hjust = 1, vjust = 2.5, size = 3, fontface = "italic")
return(p)
# x = Inf, y = Inf,
}
# ============================================================
# 6. Generate plots
# ============================================================
plot_list <- lapply(
vars_to_plot,
plot_ir_variable,
data = df_final,
short_names = short_names,
fill_colors = fill_colors,
edge_colors = edge_colors
)
plot_list <- plot_list[!sapply(plot_list, is.null)]
# ============================================================
# 7. Combine into grid (4 columns)
# ============================================================
big_panel <- wrap_plots(plot_list, ncol = 4)
big_panel# Separate numeric and categorical variables
# ============================
# Identify numeric vs categorical variables
# ============================
num_vars <- vars_to_plot[sapply(df_final[vars_to_plot], is.numeric)]
cat_vars <- vars_to_plot[sapply(df_final[vars_to_plot], function(x) !is.numeric(x))]
# ============================
# Numeric summary
# ============================
num_summary <- lapply(num_vars, function(var) {
data <- df_final %>% filter(!is.na(.data[[var]]))
IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
Total_vals <- data[[var]]
# t-test
t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
p_raw <- if (!is.null(t_res)) t_res$p.value else NA
# format p-value
p_val <- if (!is.na(p_raw)) {
if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
} else "–"
tibble(
Variable = ifelse(var %in% names(short_names), short_names[[var]], var),
IR = ifelse(length(IR_vals) > 0,
sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)), "–"),
Non_IR = ifelse(length(NonIR_vals) > 0,
sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)), "–"),
Total = ifelse(length(Total_vals) > 0,
sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)), "–"),
`p-value` = p_val
)
}) %>% bind_rows()
# ============================
# Categorical summary
# ============================
cat_summary <- lapply(cat_vars, function(var) {
tbl <- table(df_final[[var]], df_final$IR_label)
# Safe extraction of columns
IR_col <- if ("IR" %in% colnames(tbl)) tbl[, "IR"] else rep(0, nrow(tbl))
NonIR_col <- if ("Non-IR" %in% colnames(tbl)) tbl[, "Non-IR"] else rep(0, nrow(tbl))
IR_total <- sum(IR_col)
NonIR_total <- sum(NonIR_col)
total_total <- sum(tbl)
# chi-square test
chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
p_val <- if (!is.na(p_raw)) {
if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
} else "–"
# calculate percentages per column (so IR and Non-IR each sum to 100%)
IR_str <- sprintf("%d (%.1f%%)", IR_col, 100 * IR_col / IR_total)
NonIR_str <- sprintf("%d (%.1f%%)", NonIR_col, 100 * NonIR_col / NonIR_total)
Total_str <- sprintf("%d (%.1f%%)", IR_col + NonIR_col, 100 * (IR_col + NonIR_col) / total_total)
tibble(
Variable = ifelse(var %in% names(short_names), short_names[[var]], var),
IR = paste(IR_str, collapse = "; "),
Non_IR = paste(NonIR_str, collapse = "; "),
Total = paste(Total_str, collapse = "; "),
`p-value` = p_val
)
}) %>% bind_rows()
# ============================
# Combine tables
# ============================
final_table <- bind_rows(num_summary, cat_summary)
# ============================
# Add APA description row
# ============================
description_row <- tibble(
Variable = "Note:",
IR = "IR = mean ± SD OR count (%)",
Non_IR = "Non-IR = mean ± SD OR count (%)",
Total = "Total = mean ± SD OR count (%)",
`p-value` = "t-test or χ²"
)
final_table2 <- bind_rows(description_row, final_table)
# ============================
# Add significance stars + bold
# ============================
add_stars <- function(p) {
p_num <- suppressWarnings(as.numeric(p))
# Handle "<0.001"
if (grepl("<0.001", p)) return("**<0.001** ***")
# Handle non-numeric entries (e.g., "–")
if (is.na(p_num)) return(p)
# Assign stars
if (p_num < 0.001) return("**<0.001** ***")
if (p_num < 0.01) return(paste0("**", p, "**", " **"))
if (p_num < 0.05) return(paste0("**", p, "**", " *"))
return(p)
}
final_table2$`p-value` <- sapply(final_table2$`p-value`, add_stars)
# ============================
# Identify significant rows
# ============================
p_char <- final_table$`p-value`
# Remove "<" and convert to numeric
p_numeric <- as.numeric(gsub("<", "", p_char))
sig_rows <- which(!is.na(p_numeric) & p_numeric < 0.05) + 1 # +1 for description row
# ============================
# Render APA-style table
# ============================
kbl(final_table2, "html",
caption = "Table 1. Comparison of clinical visit variables by IR status", escape = FALSE) %>%
kable_styling(full_width = TRUE, position = "center") %>%
row_spec(0, bold = TRUE) %>% # header row
row_spec(1, italic = TRUE, background = "#F0F0F0") %>% # description row
column_spec(1, bold = TRUE) %>%
column_spec(5, italic = TRUE) %>%
row_spec(sig_rows, background = "#F5FBFF")| Variable | IR | Non_IR | Total | p-value |
|---|---|---|---|---|
| Note: | IR = mean ± SD OR count (%) | Non-IR = mean ± SD OR count (%) | Total = mean ± SD OR count (%) | t-test or χ² |
| Age | 55.42 ± 9.13 | 56.62 ± 11.16 | 56.08 ± 10.28 | 0.489 |
| Weight (kg) | 99.21 ± 22.99 | 80.34 ± 25.54 | 88.82 ± 26.10 | <0.001 *** |
| BMI | 35.43 ± 8.08 | 29.31 ± 7.23 | 32.06 ± 8.19 | <0.001 *** |
| Hip_Circumference | 120.76 ± 16.65 | 109.22 ± 16.58 | 114.40 ± 17.52 | <0.001 *** |
| Waist_Circumference | 112.47 ± 14.91 | 95.57 ± 15.07 | 103.16 ± 17.16 | <0.001 *** |
| WaistHip_Ratio | 0.94 ± 0.08 | 0.88 ± 0.08 | 0.90 ± 0.09 | <0.001 *** |
| PulseHR | 72.65 ± 10.65 | 70.33 ± 11.62 | 71.37 ± 11.21 | 0.225 |
| Diastolic_BP1 | 82.90 ± 9.92 | 78.05 ± 8.99 | 80.23 ± 9.69 | 0.003 ** |
| Diastolic_BP2 | 79.82 ± 8.68 | 78.55 ± 9.57 | 79.12 ± 9.17 | 0.416 |
| Systolic_BP1 | 133.00 ± 14.50 | 128.36 ± 14.42 | 130.44 ± 14.59 | 0.063 |
| Systolic_BP2 | 130.85 ± 13.79 | 126.37 ± 15.87 | 128.38 ± 15.09 | 0.078 |
| Clinical_Site | 37 (59.7%); 25 (40.3%) | 42 (55.3%); 34 (44.7%) | 79 (57.2%); 59 (42.8%) | 0.728 |
# ============================================================
# 1. Create IR label
# ============================================================
df_ucsd_final <- df_ucsd_final %>%
mutate(IR_label = ifelse(HOMA_insulin >= 2.5, "IR", "Non-IR"),
clinical_site = factor(clinical_site)) # ensure proper categorical plotting
# ============================================================
# 2. Original variables to plot
# ============================================================
vars_to_plot <- c(
"age",
"weight_vsorres..Weight..kilograms.",
"bmi_vsorres..BMI",
"hip_vsorres..Hip.Circumference..cm.",
"waist_vsorres..Waist.Circumference..cm.",
"whr_vsorres..Waist.to.Hip.Ratio..WHR.",
"pulse_vsorres..Heart.Rate..bpm.",
"bp1_diabp_vsorres..Diastolic..mmHg.",
"bp2_diabp_vsorres..Diastolic..mmHg.",
"bp1_sysbp_vsorres..Systolic..mmHg.",
"bp2_sysbp_vsorres..Systolic..mmHg.",
"clinical_site"
)
vars_to_plot <- vars_to_plot[vars_to_plot %in% names(df_ucsd_final)]
# ============================================================
# 3. Short names for titles
# ============================================================
short_names <- c(
"bmi_vsorres..BMI" = "BMI",
"bp1_diabp_vsorres..Diastolic..mmHg." = "Diastolic_BP1",
"bp2_diabp_vsorres..Diastolic..mmHg." = "Diastolic_BP2",
"bp1_sysbp_vsorres..Systolic..mmHg." = "Systolic_BP1",
"bp2_sysbp_vsorres..Systolic..mmHg." = "Systolic_BP2",
"pulse_vsorres..Heart.Rate..bpm." = "PulseHR",
"waist_vsorres..Waist.Circumference..cm." = "Waist_Circumference",
"hip_vsorres..Hip.Circumference..cm." = "Hip_Circumference",
"weight_vsorres..Weight..kilograms." = "Weight (kg)",
"whr_vsorres..Waist.to.Hip.Ratio..WHR." = "WaistHip_Ratio",
"age" = "Age",
"clinical_site" = "Clinical_Site"
)
short_names <- short_names[names(short_names) %in% vars_to_plot]
# ============================================================
# 4. Colors
# ============================================================
fill_colors <- c("Non-IR" = "#A5D8FF", "IR" = "#FFB6C1")
edge_colors <- c("Non-IR" = "#1B6CA8", "IR" = "#C71585")
# ============================================================
# 5. Plotting function with p-value on the right
# ============================================================
plot_ir_variable <- function(var_name, data, short_names, fill_colors, edge_colors) {
if (!var_name %in% names(data)) return(NULL)
plot_data <- data %>% filter(!is.na(.data[[var_name]]))
if (nrow(plot_data) == 0) return(NULL)
plot_title <- ifelse(var_name %in% names(short_names), short_names[[var_name]], var_name)
x <- plot_data[[var_name]]
# Compute p-value
p_val <- if (is.numeric(x)) {
tryCatch(t.test(x ~ IR_label, data = plot_data)$p.value, error = function(e) NA)
} else {
tbl <- table(x, plot_data$IR_label)
tryCatch(chisq.test(tbl)$p.value, error = function(e) NA)
}
# Format p-value as text
p_text <- if (!is.na(p_val)) {
if (p_val < 0.001) "p < 0.001" else paste0("p = ", signif(p_val, 3))
} else {
"p = NA"
}
if (is.numeric(x)) {
# Numeric variables: violin + jitter
p <- ggplot(plot_data, aes(x = IR_label, y = .data[[var_name]], fill = IR_label, color = IR_label)) +
geom_violin(alpha = 0.5, size = 0.8) +
geom_jitter(width = 0.1, size = 1.5, alpha = 0.8) +
scale_fill_manual(values = fill_colors) +
scale_color_manual(values = edge_colors) +
labs(title = plot_title, x = "IR Status", y = plot_title) +
theme_classic(base_size = 9) +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 9),
axis.text.y = element_text(face = "bold"),
legend.position = "none",
aspect.ratio = 1.3)
} else {
# Ensure factor for categorical variables
plot_data[[var_name]] <- factor(plot_data[[var_name]], levels = unique(plot_data[[var_name]]))
# Generalized categorical plot
tbl <- plot_data %>%
group_by(.data[[var_name]], IR_label) %>%
summarise(count = n(), .groups = "drop") %>%
tidyr::complete(.data[[var_name]], IR_label = c("IR", "Non-IR"), fill = list(count = 0)) %>%
group_by(IR_label) %>%
mutate(perc = 100 * count / sum(count)) %>%
ungroup()
# Use dodge if multiple categories, single bar if only one (like clinical_site with one site)
if (length(unique(plot_data[[var_name]])) == 1) {
# Single bar
p <- ggplot(tbl, aes(x = .data[[var_name]], y = perc, fill = IR_label)) +
geom_col(color = "black") +
scale_fill_manual(values = fill_colors) +
labs(title = plot_title, x = "", y = "Percent") +
theme_classic(base_size = 9) +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 9),
axis.text.x = element_text(face = "bold"),
axis.text.y = element_text(face = "bold"),
legend.position = "right",
aspect.ratio = 1)
} else {
# Multiple categories
p <- ggplot(tbl, aes(x = .data[[var_name]], y = perc, fill = IR_label)) +
geom_col(position = "dodge", color = "black") +
scale_fill_manual(values = fill_colors) +
labs(title = plot_title, x = plot_title, y = "Percent") +
theme_classic(base_size = 9) +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 9),
axis.text.x = element_text(angle = 45, hjust = 1),
axis.text.y = element_text(face = "bold"),
legend.position = "none",
aspect.ratio = 1.3)
}
}
return(p)
}
# ============================================================
# 6. Generate plots
# ============================================================
plot_list <- lapply(
vars_to_plot,
plot_ir_variable,
data = df_ucsd_final,
short_names = short_names,
fill_colors = fill_colors,
edge_colors = edge_colors
)
plot_list <- plot_list[!sapply(plot_list, is.null)]
big_panel <- wrap_plots(plot_list, ncol = 4)
big_panel# ============================================================
# 7. Numeric and categorical summaries
# ============================================================
# ============================================================
# 7. Numeric and categorical summaries
# ============================================================
num_vars <- vars_to_plot[sapply(df_ucsd_final[vars_to_plot], is.numeric)]
cat_vars <- vars_to_plot[sapply(df_ucsd_final[vars_to_plot], function(x) !is.numeric(x))]
# -----------------------------
# Numeric summary
# -----------------------------
num_summary <- lapply(num_vars, function(var) {
data <- df_ucsd_final %>% filter(!is.na(.data[[var]]))
IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
Total_vals <- data[[var]]
t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
p_raw <- if (!is.null(t_res)) t_res$p.value else NA
p_val <- if (!is.na(p_raw)) {
if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
} else "–"
tibble(
Variable = ifelse(var %in% names(short_names), short_names[[var]], var),
IR = ifelse(length(IR_vals) > 0, sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)), "–"),
Non_IR = ifelse(length(NonIR_vals) > 0, sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)), "–"),
Total = ifelse(length(Total_vals) > 0, sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)), "–"),
`p-value` = p_val
)
}) %>% bind_rows()
# -----------------------------
# Categorical summary (fixed)
# -----------------------------
cat_summary <- lapply(cat_vars, function(var) {
tbl <- table(df_ucsd_final[[var]], df_ucsd_final$IR_label)
# Safe extraction
IR_col <- if ("IR" %in% colnames(tbl)) tbl[, "IR"] else rep(0, nrow(tbl))
NonIR_col <- if ("Non-IR" %in% colnames(tbl)) tbl[, "Non-IR"] else rep(0, nrow(tbl))
total_col <- IR_col + NonIR_col
total_total <- sum(total_col)
# Chi-square test
chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
p_val <- if (!is.na(p_raw)) {
if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
} else "–"
# Percent of **total subjects** per group
IR_str <- sprintf("%d (%.1f%%)", IR_col, 100 * IR_col / total_total)
NonIR_str <- sprintf("%d (%.1f%%)", NonIR_col, 100 * NonIR_col / total_total)
Total_str <- sprintf("%d (%.1f%%)", total_col, 100 * total_col / total_total)
tibble(
Variable = ifelse(var %in% names(short_names), short_names[[var]], var),
IR = paste(IR_str, collapse = "; "),
Non_IR = paste(NonIR_str, collapse = "; "),
Total = paste(Total_str, collapse = "; "),
`p-value` = p_val
)
}) %>% bind_rows()
# -----------------------------
# Combine numeric + categorical
# -----------------------------
final_table <- bind_rows(num_summary, cat_summary)
# Description row
description_row <- tibble(
Variable = "Note:",
IR = "IR = mean ± SD OR count (%)",
Non_IR = "Non-IR = mean ± SD OR count (%)",
Total = "Total = mean ± SD OR count (%)",
`p-value` = "t-test or χ²"
)
final_table2 <- bind_rows(description_row, final_table)
# -----------------------------
# Add significance stars
# -----------------------------
add_stars <- function(p) {
p_num <- suppressWarnings(as.numeric(p))
if (grepl("<0.001", p)) return("**<0.001** ***")
if (is.na(p_num)) return(p)
if (p_num < 0.001) return("**<0.001** ***")
if (p_num < 0.01) return(paste0("**", p, "** **"))
if (p_num < 0.05) return(paste0("**", p, "** *"))
return(p)
}
final_table2$`p-value` <- sapply(final_table2$`p-value`, add_stars)
# -----------------------------
# Highlight significant rows
# -----------------------------
p_char <- final_table$`p-value`
p_numeric <- suppressWarnings(as.numeric(gsub("<|\\*|\\s", "", p_char)))
sig_rows <- which(!is.na(p_numeric) & p_numeric < 0.05) + 1 # +1 for description row
# -----------------------------
# Render table
# -----------------------------
kbl(final_table2, "html",
caption = "Table 1. Comparison of variables by IR status (Study 2)",
escape = FALSE) %>%
kable_styling(full_width = TRUE, position = "center") %>%
row_spec(0, bold = TRUE) %>%
row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
column_spec(1, bold = TRUE) %>%
column_spec(5, italic = TRUE) %>%
row_spec(sig_rows, background = "#F5FBFF")| Variable | IR | Non_IR | Total | p-value |
|---|---|---|---|---|
| Note: | IR = mean ± SD OR count (%) | Non-IR = mean ± SD OR count (%) | Total = mean ± SD OR count (%) | t-test or χ² |
| Age | 61.27 ± 10.83 | 61.83 ± 9.68 | 61.57 ± 10.14 | 0.839 |
| Weight (kg) | 95.39 ± 19.94 | 75.18 ± 15.58 | 84.56 ± 20.30 | <0.001 *** |
| BMI | 34.31 ± 8.48 | 26.82 ± 4.65 | 30.30 ± 7.64 | <0.001 *** |
| Hip_Circumference | 115.88 ± 19.46 | 101.72 ± 11.46 | 108.29 ± 17.09 | 0.002 ** |
| Waist_Circumference | 110.62 ± 13.45 | 91.81 ± 14.72 | 100.54 ± 16.92 | <0.001 *** |
| WaistHip_Ratio | 0.96 ± 0.08 | 0.90 ± 0.10 | 0.93 ± 0.09 | 0.011 * |
| PulseHR | 67.12 ± 10.84 | 62.73 ± 7.28 | 64.77 ± 9.29 | 0.088 |
| Diastolic_BP1 | 80.35 ± 10.76 | 76.93 ± 13.14 | 78.52 ± 12.11 | 0.290 |
| Diastolic_BP2 | 78.23 ± 12.48 | 75.07 ± 11.10 | 76.54 ± 11.76 | 0.324 |
| Systolic_BP1 | 129.46 ± 17.74 | 132.03 ± 26.06 | 130.84 ± 22.42 | 0.664 |
| Systolic_BP2 | 127.88 ± 19.29 | 131.27 ± 22.23 | 129.70 ± 20.80 | 0.545 |
| Clinical_Site | 26 (46.4%) | 30 (53.6%) | 56 (100.0%) | 0.593 |
### ======================================================================
### LOAD & CLEAN DATA
### ======================================================================
### Load “other” demographic variables
other_vars <- read.csv("~/all_vars.csv") %>%
select(
fh_dm2sb, years_of_education, sualcage, person_id, fh_dm2pt,
sualckncf, suwndosfr, suwnkncf, subrkncf, subrdosfr, sulqdosfr, sulqkncf, pxhic1
) %>%
mutate(across(everything(), ~na_if(., 777)))
other_vars$participant_id <- other_vars$person_id
### Merge with df_final
df_f <- merge(other_vars, df_final, by = "participant_id")
### Load additional datasets
sleep <- read.csv("~/Downloads/sleep_participant_summary.csv")
steps <- read.csv("~/Downloads/steps_participant_summary.csv")
diet <- read.csv("~/Downloads/diet_data.csv") %>%
mutate(across(everything(), ~na_if(., 777)))
hrv <- read.csv("~/Downloads/HRV_Joe.csv") %>%
select(hr_from_ecg, hrv_rmssd, subject_id)
all_stress <- read.csv("~/Downloads/stress_overall_daily.csv")
survey <- read.csv("~/Downloads/survey_data.csv") %>%
select(
person_id,
dmlpor..When.reflecting.on.your.typical.eating.ha,
dmlfrveg..In.a.typical.day..how.many.servings.of.,
dmlsugar..When.reflecting.on.your.typical.eating.,
dmlvex..In.a.typical.week..how.often.do.you.engag,
dmlgrain..In.a.typical.day..how.often.do.you.cons,
dmlhex..How.often.would.you.say.that.you.engage.i,
dmlact..What.would.you.consider.your.typical.acti
) %>%
mutate(across(everything(), ~na_if(., 777)))
### Merge all datasets
df_test <- df_final %>%
left_join(other_vars, by = "participant_id") %>%
left_join(sleep, by = "participant_id") %>%
left_join(steps, by = "participant_id") %>%
left_join(diet, by = c("participant_id" = "person_id")) %>%
left_join(hrv, by = c("participant_id" = "subject_id")) %>%
left_join(all_stress, by = c("participant_id" = "id")) %>%
left_join(survey, by = c("participant_id" = "person_id"))
df_test <- df_test %>% group_by(participant_id) %>% slice(1) %>% ungroup()
### ======================================================================
### RECODE VARIABLES
### ======================================================================
df_test$pxhic1_cat <- factor(
ifelse(df_test$pxhic1 == 1, "yes",
ifelse(df_test$pxhic1 == 0, "no", NA)),
levels = c("no", "yes")
)
df_test$suwnkncf_cat <- factor(
ifelse(df_test$suwnkncf == 1, "yes",
ifelse(df_test$suwnkncf == 0, "no", NA)),
levels = c("no", "yes")
)
df_test$sulqkncf_cat <- factor(
ifelse(df_test$sulqkncf == 1, "yes",
ifelse(df_test$sulqkncf == 0, "no",
ifelse(df_test$sulqkncf == 777, "prefer not answer", NA))),
levels = c("no", "yes", "prefer not answer")
)
df_test$fh_dm2pt_cat <- factor(
ifelse(df_test$fh_dm2pt == 1, "yes",
ifelse(df_test$fh_dm2pt == 0, "no",
ifelse(df_test$fh_dm2pt == 555, "prefer not answer", NA))),
levels = c("no", "yes", "prefer not answer")
)
df_test$fh_dm2sb_cat <- factor(
ifelse(df_test$fh_dm2sb == 1, "yes",
ifelse(df_test$fh_dm2sb == 0, "no",
ifelse(df_test$fh_dm2sb == 555, "prefer not answer", NA))),
levels = c("no", "yes", "prefer not answer")
)
df_test$fh_dm2_combined <- ifelse(
df_test$fh_dm2pt == 555 | df_test$fh_dm2sb == 555, "prefer not answer",
ifelse(df_test$fh_dm2pt == 1 | df_test$fh_dm2sb == 1, "yes",
ifelse(df_test$fh_dm2pt == 0 & df_test$fh_dm2sb == 0, "no", NA))
)
df_test$fh_dm2_combined <- factor(
df_test$fh_dm2_combined,
levels = c("no", "yes", "prefer not answer")
)
### ======================================================================
### DEFINE short_names2 (FULL + COMPLETE)
### ======================================================================
short_names2 <- c(
"pxhic1_cat" = "Insurance",
"sualcage" = "Total years drinking",
"suwndosfr" = "Wine (weekly)",
"suwnkncf" = "Wine in past year",
"sulqkncf" = "Liquor in past year",
"sulqdosfr" = "Liquor (weekly)",
"subrkncf" = "Beer/Ale in past year",
"subrdosfr" = "Beer/Ale (weekly)",
"fh_dm2_combined" = "Family T2D history (any)",
"fh_dm2pt_cat" = "Parent T2D history",
"fh_dm2sb_cat" = "Sibling T2D history",
"years_of_education" = "Years of education"
)
### ======================================================================
### SPLIT INTO TWO VARIABLE SETS
### ======================================================================
vars_demo <- c(
"pxhic1_cat",
"fh_dm2_combined",
"fh_dm2pt_cat",
"fh_dm2sb_cat",
"years_of_education"
)
vars_alcohol <- c(
"sualcage",
"suwndosfr",
"suwnkncf",
"sulqkncf",
"sulqdosfr",
"subrkncf",
"subrdosfr"
)
short_demo <- short_names2[names(short_names2) %in% vars_demo]
short_alcohol <- short_names2[names(short_names2) %in% vars_alcohol]
### ======================================================================
### PLOT PANELS
### (Assumes you already defined plot_ir_variable() earlier)
### ======================================================================
plot_list_demo <- lapply(
vars_demo,
plot_ir_variable,
data = df_test,
short_names = short_demo,
fill_colors = fill_colors,
edge_colors = edge_colors
)
plot_list_demo <- plot_list_demo[!sapply(plot_list_demo, is.null)]
big_panel_demo <- wrap_plots(plot_list_demo, ncol = 5)
big_panel_demo### ======================================================================
### SUMMARY TABLE FUNCTION
### ======================================================================
add_stars <- function(p) {
if (p == "–") return("–")
pnum <- suppressWarnings(as.numeric(gsub("<", "", p)))
if (is.na(pnum)) return(p)
if (pnum < 0.001) return("**<0.001**")
if (pnum < 0.01) return(paste0(p, " **"))
if (pnum < 0.05) return(paste0(p, " *"))
return(p)
}
create_summary_table <- function(var_list, short_names, df) {
num_vars <- var_list[sapply(df[var_list], is.numeric)]
cat_vars <- var_list[sapply(df[var_list], function(x) !is.numeric(x))]
num_summary <- lapply(num_vars, function(var) {
data <- df %>% filter(!is.na(.data[[var]]))
IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
Total_vals <- data[[var]]
t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
p_raw <- if (!is.null(t_res)) t_res$p.value else NA
p_val <- if (is.na(p_raw)) "–"
else if (p_raw < 0.001) "<0.001"
else formatC(p_raw, digits = 3, format = "f")
tibble(
Variable = short_names[[var]],
IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
Total = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
`p-value` = p_val
)
}) %>% bind_rows()
cat_summary <- lapply(cat_vars, function(var) {
tbl <- table(df[[var]], df$IR_label)
IR_total <- sum(tbl[, "IR"])
NonIR_total <- sum(tbl[, "Non-IR"])
total_total <- sum(tbl)
chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
p_val <- if (is.na(p_raw)) "–"
else if (p_raw < 0.001) "<0.001"
else formatC(p_raw, digits = 3, format = "f")
IR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2], 100*x[2]/IR_total))
NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1], 100*x[1]/NonIR_total))
Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
tibble(
Variable = short_names[[var]],
IR = paste(IR_str, collapse = "; "),
Non_IR = paste(NonIR_str, collapse = "; "),
Total = paste(Total_str, collapse = "; "),
`p-value` = p_val
)
}) %>% bind_rows()
final_table <- bind_rows(num_summary, cat_summary)
missing_summary <- sapply(var_list, function(v) {
n_miss <- sum(is.na(df[[v]]))
pct <- round(100 * n_miss / nrow(df), 1)
sprintf("%d (%.1f%%)", n_miss, pct)
})
final_table$`Missing (n, %)` <- missing_summary
note_row <- tibble(
Variable = "Note:",
IR = "IR = mean ± SD or count (%)",
Non_IR = "Non-IR = mean ± SD or count (%)",
Total = "Total = mean ± SD or count (%)",
`p-value` = "t-test or χ² test",
`Missing (n, %)` = ""
)
final_table <- bind_rows(note_row, final_table)
final_table$`p-value` <- sapply(final_table$`p-value`, add_stars)
final_table
}
### ======================================================================
### GENERATE DEMOGRAPHIC TABLE
### ======================================================================
table_demo <- create_summary_table(vars_demo, short_demo, df_test)
# Identify significant rows
p_char <- table_demo$`p-value`[-1] # skip Note row
p_numeric <- suppressWarnings(as.numeric(gsub("[*<]", "", p_char)))
sig_rows <- which(!is.na(p_numeric) & p_numeric < 0.05) + 1 # +1 for Note row
kbl(table_demo, "html",
caption = "Study 1. Demographic Variables by IR Status",
escape = FALSE) %>%
kable_styling(full_width = TRUE, position = "center") %>%
row_spec(0, bold = TRUE) %>% # Bold header
row_spec(1, italic = TRUE, background = "#F0F0F0") %>% # Italic Note row with background
column_spec(1, bold = TRUE) %>% # Bold Variable column
column_spec(5, italic = TRUE) %>% # Italic p-value column
row_spec(sig_rows, background = "#F5FBFF")| Variable | IR | Non_IR | Total | p-value | Missing (n, %) |
|---|---|---|---|---|---|
| Note: | IR = mean ± SD or count (%) | Non-IR = mean ± SD or count (%) | Total = mean ± SD or count (%) | t-test or χ² test | |
| Years of education | 15.13 ± 3.78 | 16.17 ± 3.67 | 15.70 ± 3.74 | 0.105 | 25 (18.1%) |
| Insurance | 13 (24.5%); 47 (88.7%) | 24 (40.0%); 29 (48.3%) | 37 (32.7%); 76 (67.3%) | 0.014 * | 0 (0.0%) |
| Family T2D history (any) | 30 (48.4%); 37 (59.7%); 9 (14.5%) | 18 (23.7%); 28 (36.8%); 16 (21.1%) | 48 (34.8%); 65 (47.1%); 25 (18.1%) | 0.089 | 0 (0.0%) |
| Parent T2D history | 37 (59.7%); 33 (53.2%); 6 (9.7%) | 23 (30.3%); 30 (39.5%); 9 (11.8%) | 60 (43.5%); 63 (45.7%); 15 (10.9%) | 0.270 | 0 (0.0%) |
| Sibling T2D history | 55 (88.7%); 16 (25.8%); 5 (8.1%) | 35 (46.1%); 14 (18.4%); 13 (17.1%) | 90 (65.2%); 30 (21.7%); 18 (13.0%) | 0.034 * | 0 (0.0%) |
### ======================================================================
### STUDY 2: LOAD & CLEAN DATA
### ======================================================================
# Load “other” demographic variables
other_vars2 <- read.csv("~/all_vars.csv") %>%
select(
fh_dm2sb, years_of_education, sualcage, person_id, fh_dm2pt,
sualckncf, suwndosfr, suwnkncf, subrkncf, subrdosfr, sulqdosfr, sulqkncf, pxhic1
) %>%
mutate(across(everything(), ~na_if(., 777)))
other_vars2$participant_id <- other_vars2$person_id
# Merge with df_ucsd_final
df_s2 <- merge(other_vars2, df_ucsd_final, by = "participant_id")
# Load additional datasets (adjust paths as needed)
sleep2 <- read.csv("~/Downloads/sleep_participant_summary.csv")
steps2 <- read.csv("~/Downloads/steps_participant_summary.csv")
diet2 <- read.csv("~/Downloads/diet_data.csv") %>%
mutate(across(everything(), ~na_if(., 777)))
hrv2 <- read.csv("~/Downloads/HRV_Joe.csv") %>%
select(hr_from_ecg, hrv_rmssdc_hr_merged, hrv_rmssd_outlier, ecg_snr, hrv_rmssd, subject_id)
all_stress2 <- read.csv("~/Downloads/stress_overall_daily.csv")
survey2 <- read.csv("~/Downloads/survey_data.csv") %>%
select(
person_id,
dmlpor..When.reflecting.on.your.typical.eating.ha,
dmlfrveg..In.a.typical.day..how.many.servings.of.,
dmlsugar..When.reflecting.on.your.typical.eating.,
dmlvex..In.a.typical.week..how.often.do.you.engag,
dmlgrain..In.a.typical.day..how.often.do.you.cons,
dmlhex..How.often.would.you.say.that.you.engage.i,
dmlact..What.would.you.consider.your.typical.acti
) %>%
mutate(across(everything(), ~na_if(., 777)))
# Merge all datasets
df_s2 <- df_s2 %>%
left_join(sleep2, by = "participant_id") %>%
left_join(steps2, by = "participant_id") %>%
left_join(diet2, by = c("participant_id" = "person_id")) %>%
left_join(hrv2, by = c("participant_id" = "subject_id")) %>%
left_join(all_stress2, by = c("participant_id" = "id")) %>%
left_join(survey2, by = c("participant_id" = "person_id"))
##prob
df_s2 = df_s2 %>% dplyr::group_by(participant_id) %>% slice(1) %>% ungroup()
#56
### ======================================================================
### RECODE VARIABLES
### ======================================================================
# Insurance
df_s2$pxhic1_cat <- factor(
ifelse(df_s2$pxhic1 == 1, "yes",
ifelse(df_s2$pxhic1 == 0, "no", NA)),
levels = c("no", "yes")
)
# Alcohol variables
df_s2$suwndosfr_cat <- factor(
ifelse(df_s2$suwndosfr == 1, "yes",
ifelse(df_s2$suwndosfr == 0, "no", NA)),
levels = c("no", "yes")
)
df_s2$suwnkncf_cat <- factor(
ifelse(df_s2$suwnkncf == 1, "yes",
ifelse(df_s2$suwnkncf == 0, "no", NA)),
levels = c("no", "yes")
)
df_s2$sulqkncf_cat <- factor(
ifelse(df_s2$sulqkncf == 1, "yes",
ifelse(df_s2$sulqkncf == 0, "no",
ifelse(df_s2$sulqkncf == 777, "prefer not answer", NA))),
levels = c("no", "yes", "prefer not answer")
)
# Family history
df_s2$fh_dm2pt_cat <- factor(
ifelse(df_s2$fh_dm2pt == 1, "yes",
ifelse(df_s2$fh_dm2pt == 0, "no",
ifelse(df_s2$fh_dm2pt == 555, "prefer not answer", NA))),
levels = c("no", "yes", "prefer not answer")
)
df_s2$fh_dm2sb_cat <- factor(
ifelse(df_s2$fh_dm2sb == 1, "yes",
ifelse(df_s2$fh_dm2sb == 0, "no",
ifelse(df_s2$fh_dm2sb == 555, "prefer not answer", NA))),
levels = c("no", "yes", "prefer not answer")
)
df_s2$fh_dm2_combined <- ifelse(
df_s2$fh_dm2pt == 555 | df_s2$fh_dm2sb == 555, "prefer not answer",
ifelse(df_s2$fh_dm2pt == 1 | df_s2$fh_dm2sb == 1, "yes",
ifelse(df_s2$fh_dm2pt == 0 & df_s2$fh_dm2sb == 0, "no", NA))
)
df_s2$fh_dm2_combined <- factor(
df_s2$fh_dm2_combined,
levels = c("no", "yes", "prefer not answer")
)
### ======================================================================
### DEFINE SHORT NAMES
### ======================================================================
short_names2 <- c(
"pxhic1_cat" = "Insurance",
"sualcage" = "Total years drinking",
"suwndosfr" = "Wine (weekly)",
"suwnkncf" = "Wine in past year",
"sulqkncf" = "Liquor in past year",
"sulqdosfr" = "Liquor (weekly)",
"subrkncf" = "Beer/Ale in past year",
"subrdosfr" = "Beer/Ale (weekly)",
"fh_dm2_combined" = "Family T2D history (any)",
"fh_dm2pt_cat" = "Parent T2D history",
"fh_dm2sb_cat" = "Sibling T2D history",
"years_of_education" = "Years of education"
)
### ======================================================================
### SELECT VARIABLES FOR DEMOGRAPHIC TABLE
### ======================================================================
vars_demo2 <- c(
"pxhic1_cat",
"fh_dm2_combined",
"fh_dm2pt_cat",
"fh_dm2sb_cat",
"years_of_education"
)
short_demo2 <- short_names2[names(short_names2) %in% vars_demo2]
### ======================================================================
### CREATE SUMMARY TABLE FUNCTION (APA STYLE)
### ======================================================================
create_summary_table2 <- function(var_list, short_names, df) {
num_vars <- var_list[sapply(df[var_list], is.numeric)]
cat_vars <- var_list[sapply(df[var_list], function(x) !is.numeric(x))]
num_summary <- lapply(num_vars, function(var) {
data <- df %>% filter(!is.na(.data[[var]]))
IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
Total_vals <- data[[var]]
t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
p_raw <- if (!is.null(t_res)) t_res$p.value else NA
p_val <- if (is.na(p_raw)) "–"
else if (p_raw < 0.001) "<0.001"
else formatC(p_raw, digits = 3, format = "f")
tibble(
Variable = short_names[[var]],
IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
Total = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
`p-value` = p_val
)
}) %>% bind_rows()
cat_summary <- lapply(cat_vars, function(var) {
tbl <- table(df[[var]], df$IR_label)
IR_total <- sum(tbl[, "IR"])
NonIR_total <- sum(tbl[, "Non-IR"])
total_total <- sum(tbl)
chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
p_val <- if (is.na(p_raw)) "–"
else if (p_raw < 0.001) "<0.001"
else formatC(p_raw, digits = 3, format = "f")
# Special case for clinical_site
if (var == "clinical_site") {
IR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x["IR"], 100*x["IR"]/total_total))
NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x["Non-IR"], 100*x["Non-IR"]/total_total))
Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
} else {
IR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x["IR"], 100*x["IR"]/IR_total))
NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x["Non-IR"], 100*x["Non-IR"]/NonIR_total))
Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
}
tibble(
Variable = short_names[[var]],
IR = paste(IR_str, collapse = "; "),
Non_IR = paste(NonIR_str, collapse = "; "),
Total = paste(Total_str, collapse = "; "),
`p-value` = p_val
)
}) %>% bind_rows()
final_table <- bind_rows(num_summary, cat_summary)
missing_summary <- sapply(var_list, function(v) {
n_miss <- sum(is.na(df[[v]]))
pct <- round(100*n_miss/nrow(df),1)
sprintf("%d (%.1f%%)", n_miss, pct)
})
final_table$`Missing (n, %)` <- missing_summary
note_row <- tibble(
Variable = "Note:",
IR = "IR = mean ± SD or count (%)",
Non_IR = "Non-IR = mean ± SD or count (%)",
Total = "Total = mean ± SD or count (%)",
`p-value` = "t-test or χ² test",
`Missing (n, %)` = ""
)
final_table <- bind_rows(note_row, final_table)
final_table$`p-value` <- sapply(final_table$`p-value`, add_stars)
final_table
}
### ======================================================================
### GENERATE STUDY 2 DEMOGRAPHIC TABLE
### ======================================================================
table_demo2 <- create_summary_table2(vars_demo2, short_demo2, df_s2)
# Identify significant rows
p_char <- table_demo2$`p-value`[-1] # skip Note row
p_numeric <- suppressWarnings(as.numeric(gsub("[*<]", "", p_char)))
sig_rows <- which(!is.na(p_numeric) & p_numeric < 0.05) + 1# Render APA-style table
# Remove NULLs in case any plots failed
plot_list_demo2 <- lapply(
vars_demo2,
plot_ir_variable,
data = df_s2,
short_names = short_demo2,
fill_colors = fill_colors,
edge_colors = edge_colors
)
plot_list_demo2 <- plot_list_demo2[!sapply(plot_list_demo2, is.null)]
# Combine into a single panel
big_panel_demo2 <- wrap_plots(plot_list_demo2, ncol = 5)
big_panel_demo2kbl(table_demo2, "html",
caption = "Study 2. Demographic Variables by IR Status (Study 2)",
escape = FALSE) %>%
kable_styling(full_width = TRUE, position = "center") %>%
row_spec(0, bold = TRUE) %>%
row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
column_spec(1, bold = TRUE) %>%
column_spec(5, italic = TRUE) %>%
row_spec(sig_rows, background = "#F5FBFF")| Variable | IR | Non_IR | Total | p-value | Missing (n, %) |
|---|---|---|---|---|---|
| Note: | IR = mean ± SD or count (%) | Non-IR = mean ± SD or count (%) | Total = mean ± SD or count (%) | t-test or χ² test | |
| Years of education | 15.54 ± 2.92 | 16.60 ± 2.51 | 16.11 ± 2.73 | 0.154 | 13 (23.2%) |
| Insurance | 10 (47.6%); 11 (52.4%) | 6 (27.3%); 16 (72.7%) | 16 (37.2%); 27 (62.8%) | 0.287 | 0 (0.0%) |
| Family T2D history (any) | 10 (38.5%); 12 (46.2%); 4 (15.4%) | 15 (50.0%); 9 (30.0%); 6 (20.0%) | 25 (44.6%); 21 (37.5%); 10 (17.9%) | 0.461 | 1 (1.8%) |
| Parent T2D history | 12 (48.0%); 10 (40.0%); 3 (12.0%) | 17 (56.7%); 9 (30.0%); 4 (13.3%) | 29 (52.7%); 19 (34.5%); 7 (12.7%) | 0.738 | 0 (0.0%) |
| Sibling T2D history | 15 (57.7%); 9 (34.6%); 2 (7.7%) | 20 (66.7%); 6 (20.0%); 4 (13.3%) | 35 (62.5%); 15 (26.8%); 6 (10.7%) | 0.427 | 0 (0.0%) |
### ======================================================================
### GENERATE ALCOHOL TABLE
### ======================================================================
### ======================================================================
### STUDY 2: PLOT PANELS
### ======================================================================
plot_list_alcohol <- lapply(
vars_alcohol,
plot_ir_variable,
data = df_test,
short_names = short_alcohol,
fill_colors = fill_colors,
edge_colors = edge_colors
)
plot_list_alcohol <- plot_list_alcohol[!sapply(plot_list_alcohol, is.null)]
big_panel_alcohol <- wrap_plots(plot_list_alcohol, ncol = 4)
big_panel_alcoholtable_alcohol <- create_summary_table(vars_alcohol, short_alcohol, df_test)
kbl(table_alcohol, "html",
caption = "Study 1. Alcohol Variables by IR Status",
escape = FALSE) %>%
kable_styling(full_width = TRUE, position = "center") %>%
row_spec(sig_rows, background = "#F5FBFF")| Variable | IR | Non_IR | Total | p-value | Missing (n, %) |
|---|---|---|---|---|---|
| Note: | IR = mean ± SD or count (%) | Non-IR = mean ± SD or count (%) | Total = mean ± SD or count (%) | t-test or χ² test | |
| Total years drinking | 19.76 ± 7.22 | 18.70 ± 3.29 | 19.20 ± 5.48 | 0.344 | 31 (22.5%) |
| Wine (weekly) | 0.90 ± 1.40 | 2.97 ± 4.68 | 2.09 ± 3.78 | 0.012 * | 70 (50.7%) |
| Wine in past year | 0.58 ± 0.50 | 0.68 ± 0.47 | 0.64 ± 0.48 | 0.270 | 31 (22.5%) |
| Liquor in past year | 0.70 ± 0.46 | 0.60 ± 0.49 | 0.64 ± 0.48 | 0.266 | 31 (22.5%) |
| Liquor (weekly) | 1.29 ± 2.64 | 2.18 ± 3.24 | 1.72 ± 2.96 | 0.218 | 70 (50.7%) |
| Beer/Ale in past year | 0.50 ± 0.51 | 0.63 ± 0.49 | 0.57 ± 0.50 | 0.174 | 31 (22.5%) |
| Beer/Ale (weekly) | 1.08 ± 2.55 | 3.14 ± 6.78 | 2.30 ± 5.52 | 0.103 | 77 (55.8%) |
vars_alcohol2 <- c(
"sualcage",
"suwndosfr",
"suwnkncf",
"sulqkncf",
"sulqdosfr",
"subrkncf",
"subrdosfr"
)
short_demo2 <- short_names2[names(short_names2) %in% vars_demo2]
short_alcohol2 <- short_names2[names(short_names2) %in% vars_alcohol2]
# -------------------------------------
# Alcohol plots
# -------------------------------------
plot_list_alcohol2 <- lapply(
vars_alcohol2,
plot_ir_variable,
data = df_s2,
short_names = short_alcohol2,
fill_colors = fill_colors,
edge_colors = edge_colors
)
plot_list_alcohol2 <- plot_list_alcohol2[!sapply(plot_list_alcohol2, is.null)]
big_panel_alcohol2 <- wrap_plots(plot_list_alcohol2, ncol = 4)
big_panel_alcohol2### ======================================================================
### STUDY 2 — SAFE ALCOHOL VARIABLE LIST
### ======================================================================
vars_alcohol2 <- c(
"sualcage",
"suwndosfr",
"suwnkncf",
"sulqkncf",
"sulqdosfr",
"subrkncf",
"subrdosfr"
)
# Keep ONLY variables that exist in df_s2
vars_alcohol2 <- vars_alcohol2[vars_alcohol2 %in% names(df_s2)]
short_alcohol2 <- short_names2[names(short_names2) %in% vars_alcohol2]
### ======================================================================
### STUDY 2 — ALCOHOL PLOTS
### ======================================================================
plot_list_alcohol2 <- lapply(
vars_alcohol2,
plot_ir_variable,
data = df_s2,
short_names = short_alcohol2,
fill_colors = fill_colors,
edge_colors = edge_colors
)
### ======================================================================
### STUDY 2 ALCOHOL TABLE — FIXED
### ======================================================================
vars_alcohol2 <- c(
"sualcage",
"suwndosfr",
"suwnkncf",
"sulqkncf",
"sulqdosfr",
"subrkncf",
"subrdosfr"
)
# Keep ONLY variables that exist in df_s2
vars_alcohol2 <- vars_alcohol2[vars_alcohol2 %in% names(df_s2)]
short_alcohol2 <- short_names2[names(short_names2) %in% vars_alcohol2]
### ======================================================================
### STUDY 2 — ALCOHOL PLOTS
### ======================================================================
### STUDY 2 ALCOHOL TABLE — FIXED
### ======================================================================
table_alcohol2 <- create_summary_table(vars_alcohol2, short_alcohol2, df_s2)
kbl(table_alcohol2, "html", # <-- FIXED: now using table_alcohol2
caption = "Study 2. Alcohol Variables by IR Status",
escape = FALSE) %>%
kable_styling(full_width = TRUE, position = "center") %>%
row_spec(sig_rows, background = "#F5FBFF")| Variable | IR | Non_IR | Total | p-value | Missing (n, %) |
|---|---|---|---|---|---|
| Note: | IR = mean ± SD or count (%) | Non-IR = mean ± SD or count (%) | Total = mean ± SD or count (%) | t-test or χ² test | |
| Total years drinking | 19.32 ± 5.76 | 19.08 ± 3.98 | 19.20 ± 4.90 | 0.865 | 6 (10.7%) |
| Wine (weekly) | 1.42 ± 2.15 | 1.12 ± 1.50 | 1.25 ± 1.78 | 0.692 | 28 (50.0%) |
| Wine in past year | 0.48 ± 0.51 | 0.64 ± 0.49 | 0.56 ± 0.50 | 0.264 | 6 (10.7%) |
| Liquor in past year | 0.60 ± 0.50 | 0.56 ± 0.51 | 0.58 ± 0.50 | 0.780 | 6 (10.7%) |
| Liquor (weekly) | 1.80 ± 3.86 | 0.71 ± 1.33 | 1.28 ± 2.93 | 0.319 | 27 (48.2%) |
| Beer/Ale in past year | 0.56 ± 0.51 | 0.68 ± 0.48 | 0.62 ± 0.49 | 0.392 | 6 (10.7%) |
| Beer/Ale (weekly) | 1.71 ± 3.02 | 0.88 ± 1.96 | 1.26 ± 2.49 | 0.385 | 25 (44.6%) |
# other_vars <- read.csv("~/all_vars.csv") %>%
# select(fh_dm2sb, years_of_education, sualcage, sualckncf, sualcdrv, pxhic1) %>%
# mutate(across(everything(), ~na_if(., 777)))
#
# other_vars$participant_id = other_vars$person_id
# #length(other_vars$participant_id)
#
# df_f = merge(other_vars, df_final,by= "person_id", by.y="participant_id")
# #length(df_f$participant_id) #138
# sleep <- read.csv("~/Downloads/sleep_participant_summary.csv")
# steps <- read.csv("~/Downloads/steps_participant_summary.csv")
# diet <- read.csv("~/Downloads/diet_data.csv") %>%
# mutate(across(everything(), ~na_if(., 777)))
# hrv <- read.csv("~/Downloads/HRV_Joe.csv") %>%
# dplyr::select(hr_from_ecg, hrv_rmssdc_hr_merged, hrv_rmssd_outlier, ecg_snr, hrv_rmssd, subject_id)
# all_stress <- read.csv("~/Downloads/stress_overall_daily.csv")
# survey = read.csv("~/Downloads/survey_data.csv") %>%
# # Select the columns you want
# dplyr::select(
# person_id,
# dmlpor..When.reflecting.on.your.typical.eating.ha,
# dmlfrveg..In.a.typical.day..how.many.servings.of.,
# dmlsugar..When.reflecting.on.your.typical.eating.,
# dmlvex..In.a.typical.week..how.often.do.you.engag,
# dmlgrain..In.a.typical.day..how.often.do.you.cons,
# dmlhex..How.often.would.you.say.that.you.engage.i,
# dmlact..What.would.you.consider.your.typical.acti,
# ) %>%
# # Replace 777 with NA
# dplyr::mutate(across(everything(), ~na_if(., 777)))
#
# #cgm <- read.csv("~/Downloads/cgm_cleaned_daily.csv")
#
# df_test <- df_final %>%
# left_join(other_vars, by = "participant_id") %>%
# left_join(sleep, by = "participant_id") %>%
# left_join(steps, by = "participant_id") %>%
# left_join(diet, by = c("participant_id" = "person_id")) %>%
# left_join(hrv, by = c("participant_id" = "subject_id")) %>%
# left_join(all_stress, by = c("participant_id" = "id")) %>%
# left_join(survey, by = c("participant_id" = "person_id"))
#
# # Keep only one row per participant
# df_test <- df_test %>%
# group_by(participant_id) %>%
# slice(1) %>% # keep the first row per participant
# ungroup()
# Check that each participant appears only once
#table(duplicated(df_test$participant_id)) # should all be FALSE 112| Variable | IR | Non_IR | Total | p-value | Missing (n, %) |
|---|---|---|---|---|---|
| Note: | IR = mean ± SD or count (%) | Non-IR = mean ± SD or count (%) | Total = mean ± SD or count (%) | t-test or χ² test | Number of missing values per variable |
| Vig Exercise Frequency | 3.71 ± 4.04 | 5.66 ± 4.19 | 4.78 ± 4.22 | 0.006 ** | 0 (0%) |
| Typical Activity Level | 5.81 ± 3.75 | 7.43 ± 3.51 | 6.70 ± 3.70 | 0.010 * | 0 (0%) |
| Mean Total Steps | 7357.54 ± 4118.51 | 7549.96 ± 4239.77 | 7463.51 ± 4171.62 | 0.788 | 0 (0%) |
| SD Total Steps | 6239.10 ± 2556.69 | 6669.63 ± 3222.56 | 6476.20 ± 2939.70 | 0.383 | 0 (0%) |
| CV Total Steps | 1.00 ± 0.59 | 0.97 ± 0.32 | 0.98 ± 0.46 | 0.708 | 0 (0%) |
| Mean Active Minutes | 182.29 ± 107.27 | 184.60 ± 87.17 | 183.56 ± 96.36 | 0.892 | 0 (0%) |
| Mean Activity Onset | 2.91 ± 1.13 | 2.90 ± 1.08 | 2.90 ± 1.10 | 0.941 | 0 (0%) |
| SD Activity Onset | 4.07 ± 1.16 | 4.04 ± 0.97 | 4.05 ± 1.05 | 0.863 | 0 (0%) |
| Pct Steps Morning | 16.09 ± 7.48 | 16.54 ± 7.35 | 16.33 ± 7.38 | 0.723 | 0 (0%) |
| Pct Steps Afternoon | 24.82 ± 9.26 | 22.46 ± 6.78 | 23.52 ± 8.05 | 0.097 | 0 (0%) |
| Pct Steps Evening | 18.86 ± 7.03 | 19.02 ± 7.21 | 18.95 ± 7.10 | 0.897 | 0 (0%) |
| Activity Midpoint | 11.24 ± 1.16 | 11.10 ± 1.12 | 11.16 ± 1.14 | 0.484 | 0 (0%) |
| Mean Steps Weekend | 7500.16 ± 4628.36 | 6984.66 ± 4292.10 | 7216.26 ± 4437.28 | 0.503 | 0 (0%) |
| Mean Steps Weekday | 7284.68 ± 4238.04 | 7759.95 ± 4874.57 | 7546.42 ± 4589.29 | 0.541 | 0 (0%) |
### ======================================================================
### STUDY 2 — PHYSICAL ACTIVITY PLOTS + TABLE
### Using df_ucsd_final
### ======================================================================
# ---------------------------------------------------------
# VARIABLES TO PLOT (must match Study 1 structure exactly)
# ---------------------------------------------------------
vars_to_plot_s2 <- c(
"dmlhex..How.often.would.you.say.that.you.engage.",
"dmlvex..In.a.typical.week..how.often.do.you.engag",
"dmlact..What.would.you.consider.your.typical.acti",
"mean_total_steps",
"sd_total_steps",
"cv_total_steps",
"mean_active_minutes",
"mean_activity_onset_h",
"sd_activity_onset_h",
"mean_pct_steps_morning",
"mean_pct_steps_afternoon",
"mean_pct_steps_evening",
"mean_activity_midpoint_h",
"mean_steps_weekend",
"mean_steps_weekday"
)
# Only keep variables that actually exist in df_s2
vars_to_plot_s2 <- vars_to_plot_s2[vars_to_plot_s2 %in% names(df_s2)]
# ---------------------------------------------------------
# SHORT NAMES — FILTER + ENSURE CHARACTER VECTOR
# ---------------------------------------------------------
short_names_s2 <- c(
"dmlhex..How.often.would.you.say.that.you.engage." = "Home Exercise Freq.",
"dmlvex..In.a.typical.week..how.often.do.you.engag" = "Vig. Exercise Freq.",
"dmlact..What.would.you.consider.your.typical.acti" = "Typical Activity",
"mean_total_steps" = "Mean Total Steps",
"sd_total_steps" = "SD Total Steps",
"cv_total_steps" = "CV Total Steps",
"mean_active_minutes" = "Mean Active Min",
"mean_activity_onset_h" = "Mean Activity Onset",
"sd_activity_onset_h" = "SD Activity Onset",
"mean_pct_steps_morning" = "Pct Steps Morning",
"mean_pct_steps_afternoon" = "Pct Steps Afternoon",
"mean_pct_steps_evening" = "Pct Steps Evening",
"mean_activity_midpoint_h" = "Activity Midpoint",
"mean_steps_weekend" = "Mean Steps Wknd",
"mean_steps_weekday" = "Mean Steps Wkday"
)
# Filter + force character vector (fix for earlier Study 2 table error)
short_names_s2 <- unlist(short_names_s2[names(short_names_s2) %in% vars_to_plot_s2])
# ---------------------------------------------------------
# PLOTS (Study 2)
# ---------------------------------------------------------
plot_list_s2 <- lapply(
vars_to_plot_s2,
plot_ir_variable,
data = df_s2,
short_names = short_names_s2,
fill_colors = fill_colors,
edge_colors = edge_colors
)
plot_list_s2 <- plot_list_s2[!sapply(plot_list_s2, is.null)]
big_panel_s2 <- wrap_plots(plot_list_s2, ncol = 5)
big_panel_s2### ======================================================================
### STUDY 2 — SUMMARY TABLE
### ======================================================================
num_vars_s2 <- vars_to_plot_s2[sapply(df_s2[vars_to_plot_s2], is.numeric)]
cat_vars_s2 <- vars_to_plot_s2[sapply(df_s2[vars_to_plot_s2], function(x) !is.numeric(x))]
# ---------------------------------------------------------
# Numeric variables summary
# ---------------------------------------------------------
num_summary_s2 <- lapply(num_vars_s2, function(var) {
data <- df_s2 %>% filter(!is.na(.data[[var]]))
IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
Total_vals <- data[[var]]
t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
p_raw <- if (!is.null(t_res)) t_res$p.value else NA
p_val <- if (!is.na(p_raw)) {
if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
} else "–"
tibble(
Variable = short_names_s2[[var]],
IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
Total = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
`p-value` = p_val
)
}) %>% bind_rows()
# ---------------------------------------------------------
# Categorical variables summary
# ---------------------------------------------------------
cat_summary_s2 <- lapply(cat_vars_s2, function(var) {
tbl <- table(df_s2[[var]], df_s2$IR_label)
IR_total <- sum(tbl[, "IR"])
NonIR_total <- sum(tbl[, "Non-IR"])
total_total <- sum(tbl)
chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
p_val <- if (!is.na(p_raw)) {
if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
} else "–"
IR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2], 100*x[2]/IR_total))
NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1], 100*x[1]/NonIR_total))
Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
tibble(
Variable = short_names_s2[[var]],
IR = paste(IR_str, collapse = "; "),
Non_IR = paste(NonIR_str, collapse = "; "),
Total = paste(Total_str, collapse = "; "),
`p-value` = p_val
)
}) %>% bind_rows()
# ---------------------------------------------------------
# Combine tables
# ---------------------------------------------------------
final_table_s2 <- bind_rows(num_summary_s2, cat_summary_s2)
final_table_s2$`p-value` <- sapply(final_table_s2$`p-value`, add_stars)
description_row_s2 <- tibble(
Variable = "Note:",
IR = "IR = mean ± SD or count (%)",
Non_IR = "Non-IR = mean ± SD or count (%)",
Total = "Total = mean ± SD or count (%)",
`p-value` = "t-test or χ² test"
)
final_table_s2 <- bind_rows(description_row_s2, final_table_s2)
final_table_s2$`p-value` <- sapply(final_table_s2$`p-value`, add_stars)
# ---------------------------------------------------------
# Missingness per variable
# ---------------------------------------------------------
missing_summary_s2 <- sapply(vars_to_plot_s2, function(var) {
n_missing <- sum(is.na(df_s2[[var]]))
pct_missing <- round(100 * n_missing / nrow(df_s2), 1)
paste0(n_missing, " (", pct_missing, "%)")
})
missing_col_s2 <- c("Number of missing values per variable", missing_summary_s2)
final_table_s2$`Missing (n, %)` <- missing_col_s2
# ---------------------------------------------------------
# Significant rows
# ---------------------------------------------------------
p_char_s2 <- final_table_s2$`p-value`
p_numeric_s2 <- as.numeric(gsub("<", "", p_char_s2))
sig_rows_s2 <- which(!is.na(p_numeric_s2) & p_numeric_s2 < 0.05) + 1
# ---------------------------------------------------------
# Render
# ---------------------------------------------------------
kbl(final_table_s2, "html",
caption = "Study 2. Physical Activity Variables by IR Status",
escape = FALSE) %>%
kable_styling(full_width = TRUE, position = "center") %>%
row_spec(0, bold = TRUE) %>%
row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
column_spec(1, bold = TRUE) %>%
column_spec(5, italic = TRUE) %>%
row_spec(sig_rows_s2, background = "#F5FBFF")| Variable | IR | Non_IR | Total | p-value | Missing (n, %) |
|---|---|---|---|---|---|
| Note: | IR = mean ± SD or count (%) | Non-IR = mean ± SD or count (%) | Total = mean ± SD or count (%) | t-test or χ² test | Number of missing values per variable |
| Vig. Exercise Freq. | 5.00 ± 4.24 | 6.17 ± 4.49 | 5.62 ± 4.38 | 0.322 | 0 (0%) |
| Typical Activity | 6.35 ± 3.89 | 8.17 ± 3.34 | 7.32 ± 3.69 | 0.068 | 0 (0%) |
| Mean Total Steps | 5199.23 ± 4832.78 | 5652.44 ± 2697.07 | 5442.02 ± 3808.38 | 0.674 | 0 (0%) |
| SD Total Steps | 5619.60 ± 3251.97 | 4783.17 ± 1750.44 | 5170.41 ± 2566.08 | 0.258 | 2 (3.6%) |
| CV Total Steps | 1.24 ± 0.55 | 0.97 ± 0.44 | 1.10 ± 0.51 | 0.056 | 2 (3.6%) |
| Mean Active Min | 135.38 ± 87.68 | 170.73 ± 78.37 | 154.32 ± 83.96 | 0.120 | 0 (0%) |
| Mean Activity Onset | 3.14 ± 1.41 | 3.85 ± 1.80 | 3.53 ± 1.66 | 0.106 | 1 (1.8%) |
| SD Activity Onset | 4.79 ± 1.92 | 4.52 ± 1.33 | 4.64 ± 1.62 | 0.556 | 2 (3.6%) |
| Pct Steps Morning | 16.02 ± 10.02 | 19.47 ± 9.48 | 17.87 ± 9.80 | 0.193 | 0 (0%) |
| Pct Steps Afternoon | 19.23 ± 11.03 | 23.18 ± 7.84 | 21.35 ± 9.57 | 0.135 | 0 (0%) |
| Pct Steps Evening | 14.51 ± 7.04 | 19.77 ± 10.30 | 17.33 ± 9.25 | 0.028 * | 0 (0%) |
| Activity Midpoint | 10.94 ± 1.10 | 11.28 ± 1.28 | 11.13 ± 1.20 | 0.295 | 1 (1.8%) |
| Mean Steps Wknd | 6190.92 ± 6284.24 | 6859.84 ± 4162.77 | 6562.54 ± 5170.32 | 0.656 | 2 (3.6%) |
| Mean Steps Wkday | 5059.24 ± 4699.78 | 5228.52 ± 2367.79 | 5148.50 ± 3624.94 | 0.869 | 1 (1.8%) |
vars_to_plot2 <- c(
"cpd_h",
"sd_onset_clock_h",
"sd_midpoint_clock_h",
"mean_wake_clock_h",
"social_jetlag_h",
"sri",
"sd_wake_clock_h",
"mean_midpoint_weekday_h",
"mean_onset_clock_h",
"mean_midpoint_clock_h",
"mean_midpoint_weekend_h"
)
# Keep only those that actually exist
vars_to_plot2 <- vars_to_plot2[vars_to_plot2 %in% names(df_test)]
short_names2 <- c(
"cpd_h" = "Sleep Chronotype",
"sd_onset_clock_h" = "SD Sleep Onset",
"sd_midpoint_clock_h" = "Sleep Midpoint SD",
"mean_wake_clock_h" = "Mean Wake Time",
"social_jetlag_h" = "Social Jetlag",
"sri" = "Sleep Reg. Index",
"sd_wake_clock_h" = "SD Wake Time",
"mean_midpoint_weekday_h" = "Mean Midpoint Wkday",
"mean_onset_clock_h" = "Sleep Onset",
"mean_midpoint_clock_h" = "M Sleep Midpoint",
"mean_midpoint_weekend_h" = "M Midpoint Wknd"
)
short_names2 <- short_names2[names(short_names2) %in% vars_to_plot2]
plot_list2 <- lapply(
vars_to_plot2,
plot_ir_variable,
data = df_test,
short_names = short_names2,
fill_colors = fill_colors,
edge_colors = edge_colors
)
plot_list2 <- plot_list2[!sapply(plot_list2, is.null)]
big_panel2 <- wrap_plots(plot_list2, ncol = 4)
big_panel2num_vars2 <- vars_to_plot2[sapply(df_test[vars_to_plot2], is.numeric)]
cat_vars2 <- vars_to_plot2[sapply(df_test[vars_to_plot2], function(x) !is.numeric(x))]
num_summary2 <- lapply(num_vars2, function(var) {
data <- df_test %>% filter(!is.na(.data[[var]]))
IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
Total_vals <- data[[var]]
t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
p_raw <- if (!is.null(t_res)) t_res$p.value else NA
p_val <- if (!is.na(p_raw)) {
if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
} else "–"
tibble(
Variable = short_names2[[var]],
IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
Total = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
`p-value` = p_val
)
}) %>% bind_rows()
cat_summary2 <- lapply(cat_vars2, function(var) {
tbl <- table(df_test[[var]], df_test$IR_label)
IR_total <- sum(tbl[, "IR"])
NonIR_total <- sum(tbl[, "Non-IR"])
total_total <- sum(tbl)
chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
p_val <- if (!is.na(p_raw)) {
if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
} else "–"
IR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2], 100*x[2]/IR_total))
NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1], 100*x[1]/NonIR_total))
Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
tibble(
Variable = short_names2[[var]],
IR = paste(IR_str, collapse = "; "),
Non_IR = paste(NonIR_str, collapse = "; "),
Total = paste(Total_str, collapse = "; "),
`p-value` = p_val
)
}) %>% bind_rows()
final_tableB2 <- bind_rows(num_summary2, cat_summary2)
final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)
description_rowB <- tibble(
Variable = "Note:",
IR = "IR = mean ± SD or count (%)",
Non_IR = "Non-IR = mean ± SD or count (%)",
Total = "Total = mean ± SD or count (%)",
`p-value` = "t-test or χ² test"
)
final_tableB2 <- bind_rows(description_rowB, final_tableB2)
final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)
p_char2 <- final_tableB2$`p-value`
p_numeric2 <- as.numeric(gsub("<", "", p_char2))
sig_rows2 <- which(!is.na(p_numeric2) & p_numeric2 < 0.05) + 1
# Compute missing counts per variable
# ----------------------------
# Compute missing counts for the variables actually plotted
missing_summary <- sapply(vars_to_plot2, function(var) {
n_missing <- sum(is.na(df_test[[var]]))
pct_missing <- round(100 * n_missing / nrow(df_test), 1)
paste0(n_missing, " (", pct_missing, "%)")
})
# Add description for the first row
missing_col <- c("Number of missing values per variable", missing_summary)
# Extend final_tableB2 to have the correct length
final_tableB2$`Missing (n, %)` <- missing_col
# Render the table
kbl(final_tableB2, "html",
caption = "Study 1. Sleep variables by IR Status",
escape = FALSE) %>%
kable_styling(full_width = TRUE, position = "center") %>%
row_spec(0, bold = TRUE) %>%
row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
column_spec(1, bold = TRUE) %>%
column_spec(5, italic = TRUE) %>%
row_spec(sig_rows2, background = "#F5FBFF")| Variable | IR | Non_IR | Total | p-value | Missing (n, %) |
|---|---|---|---|---|---|
| Note: | IR = mean ± SD or count (%) | Non-IR = mean ± SD or count (%) | Total = mean ± SD or count (%) | t-test or χ² test | Number of missing values per variable |
| Sleep Chronotype | 1.48 ± 1.22 | 1.22 ± 0.98 | 1.34 ± 1.10 | 0.173 | 5 (3.6%) |
| SD Sleep Onset | 1.88 ± 1.22 | 1.68 ± 1.25 | 1.77 ± 1.24 | 0.348 | 5 (3.6%) |
| Sleep Midpoint SD | 2.01 ± 1.58 | 1.63 ± 1.28 | 1.80 ± 1.43 | 0.136 | 5 (3.6%) |
| Mean Wake Time | 30.16 ± 2.14 | 30.20 ± 2.09 | 30.18 ± 2.11 | 0.893 | 0 (0%) |
| Social Jetlag | 1.48 ± 1.56 | 1.17 ± 1.39 | 1.31 ± 1.47 | 0.244 | 7 (5.1%) |
| Sleep Reg. Index | 29.21 ± 23.98 | 36.35 ± 24.88 | 33.13 ± 24.65 | 0.098 | 7 (5.1%) |
| SD Wake Time | 3.21 ± 3.05 | 2.65 ± 2.72 | 2.90 ± 2.87 | 0.278 | 5 (3.6%) |
| Mean Midpoint Wkday | 27.00 ± 1.56 | 26.83 ± 1.22 | 26.90 ± 1.37 | 0.500 | 7 (5.1%) |
| Sleep Onset | 23.64 ± 1.53 | 23.38 ± 1.48 | 23.49 ± 1.50 | 0.317 | 0 (0%) |
| M Sleep Midpoint | 26.90 ± 1.38 | 26.79 ± 1.25 | 26.84 ± 1.31 | 0.643 | 0 (0%) |
| M Midpoint Wknd | 26.54 ± 2.26 | 26.58 ± 2.05 | 26.56 ± 2.14 | 0.925 | 7 (5.1%) |
# ----------------------------
# STUDY 2
# ----------------------------
vars_to_plot2_s2 <- c(
"cpd_h",
"sd_onset_clock_h",
"sd_midpoint_clock_h",
"mean_wake_clock_h",
"social_jetlag_h",
"sri",
"sd_wake_clock_h",
"mean_midpoint_weekday_h",
"mean_onset_clock_h",
"mean_midpoint_clock_h",
"mean_midpoint_weekend_h"
)
# Keep only the variables that exist
vars_to_plot2_s2 <- vars_to_plot2_s2[vars_to_plot2_s2 %in% names(df_s2)]
short_names2_s2 <- c(
"cpd_h" = "Sleep Chronotype",
"sd_onset_clock_h" = "SD Sleep Onset",
"sd_midpoint_clock_h" = "Sleep Midpoint SD",
"mean_wake_clock_h" = "Mean Wake Time",
"social_jetlag_h" = "Social Jetlag",
"sri" = "Sleep Reg. Index",
"sd_wake_clock_h" = "SD WakeTime",
"mean_midpoint_weekday_h" = "M Midpoint Weekday",
"mean_onset_clock_h" = "Sleep Onset",
"mean_midpoint_clock_h" = "M Sleep Midpoint",
"mean_midpoint_weekend_h" = "M Midpoint Weekend"
)
short_names2_s2 <- short_names2_s2[names(short_names2_s2) %in% vars_to_plot2_s2]
# ----------------------------
# PLOTS FOR STUDY 2
# ----------------------------
plot_list2_s2 <- lapply(
vars_to_plot2_s2,
plot_ir_variable,
data = df_s2,
short_names = short_names2_s2,
fill_colors = fill_colors,
edge_colors = edge_colors
)
plot_list2_s2 <- plot_list2_s2[!sapply(plot_list2_s2, is.null)]
big_panel2_s2 <- wrap_plots(plot_list2_s2, ncol = 4)
big_panel2_s2# ----------------------------
# SUMMARY TABLE FOR STUDY 2
# ----------------------------
num_vars2_s2 <- vars_to_plot2_s2[sapply(df_s2[vars_to_plot2_s2], is.numeric)]
cat_vars2_s2 <- vars_to_plot2_s2[sapply(df_s2[vars_to_plot2_s2], function(x) !is.numeric(x))]
num_summary2_s2 <- lapply(num_vars2_s2, function(var) {
data <- df_s2 %>% filter(!is.na(.data[[var]]))
IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
Total_vals <- data[[var]]
t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
p_raw <- if (!is.null(t_res)) t_res$p.value else NA
p_val <- if (!is.na(p_raw)) {
if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
} else "–"
tibble(
Variable = short_names2_s2[[var]],
IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
Total = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
`p-value` = p_val
)
}) %>% bind_rows()
cat_summary2_s2 <- lapply(cat_vars2_s2, function(var) {
tbl <- table(df_s2[[var]], df_s2$IR_label)
IR_total <- sum(tbl[, "IR"])
NonIR_total <- sum(tbl[, "Non-IR"])
total_total <- sum(tbl)
chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
p_val <- if (!is.na(p_raw)) {
if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
} else "–"
IR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2], 100*x[2]/IR_total))
NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1], 100*x[1]/NonIR_total))
Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
tibble(
Variable = short_names2_s2[[var]],
IR = paste(IR_str, collapse = "; "),
Non_IR = paste(NonIR_str, collapse = "; "),
Total = paste(Total_str, collapse = "; "),
`p-value` = p_val
)
}) %>% bind_rows()
final_tableB2_s2 <- bind_rows(num_summary2_s2, cat_summary2_s2)
final_tableB2_s2$`p-value` <- sapply(final_tableB2_s2$`p-value`, add_stars)
# Add note row
description_rowB_s2 <- tibble(
Variable = "Note:",
IR = "IR = mean ± SD or count (%)",
Non_IR = "Non-IR = mean ± SD or count (%)",
Total = "Total = mean ± SD or count (%)",
`p-value` = "t-test or χ² test"
)
final_tableB2_s2 <- bind_rows(description_rowB_s2, final_tableB2_s2)
final_tableB2_s2$`p-value` <- sapply(final_tableB2_s2$`p-value`, add_stars)
# Identify significant rows
p_char2_s2 <- final_tableB2_s2$`p-value`
p_numeric2_s2 <- as.numeric(gsub("<", "", p_char2_s2))
sig_rows2_s2 <- which(!is.na(p_numeric2_s2) & p_numeric2_s2 < 0.05) + 1
# ----------------------------
# Missingness per variable
# ----------------------------
missing_summary_s2 <- sapply(vars_to_plot2_s2, function(var) {
n_missing <- sum(is.na(df_s2[[var]]))
pct_missing <- round(100 * n_missing / nrow(df_s2), 1)
paste0(n_missing, " (", pct_missing, "%)")
})
missing_col_s2 <- c("Number of missing values per variable", missing_summary_s2)
final_tableB2_s2$`Missing (n, %)` <- missing_col_s2
# ----------------------------
# Render Table for Study 2
# ----------------------------
kbl(final_tableB2_s2, "html",
caption = "Study 2. Sleep Variables by IR Status",
escape = FALSE) %>%
kable_styling(full_width = TRUE, position = "center") %>%
row_spec(0, bold = TRUE) %>%
row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
column_spec(1, bold = TRUE) %>%
column_spec(5, italic = TRUE) %>%
row_spec(sig_rows2_s2, background = "#F5FBFF")| Variable | IR | Non_IR | Total | p-value | Missing (n, %) |
|---|---|---|---|---|---|
| Note: | IR = mean ± SD or count (%) | Non-IR = mean ± SD or count (%) | Total = mean ± SD or count (%) | t-test or χ² test | Number of missing values per variable |
| Sleep Chronotype | 1.09 ± 0.57 | 1.08 ± 1.04 | 1.09 ± 0.86 | 0.954 | 5 (8.9%) |
| SD Sleep Onset | 1.66 ± 0.99 | 1.40 ± 0.96 | 1.51 ± 0.97 | 0.351 | 5 (8.9%) |
| Sleep Midpoint SD | 1.48 ± 0.81 | 1.51 ± 1.41 | 1.49 ± 1.18 | 0.936 | 5 (8.9%) |
| Mean Wake Time | 30.84 ± 1.19 | 30.43 ± 1.36 | 30.62 ± 1.29 | 0.245 | 2 (3.6%) |
| Social Jetlag | 0.77 ± 0.61 | 1.06 ± 1.12 | 0.94 ± 0.94 | 0.238 | 6 (10.7%) |
| Sleep Reg. Index | 30.87 ± 26.56 | 37.15 ± 21.35 | 34.33 ± 23.78 | 0.374 | 7 (12.5%) |
| SD WakeTime | 1.90 ± 1.04 | 2.23 ± 2.86 | 2.09 ± 2.25 | 0.565 | 5 (8.9%) |
| M Midpoint Weekday | 26.99 ± 1.35 | 27.00 ± 0.86 | 26.99 ± 1.08 | 0.980 | 6 (10.7%) |
| Sleep Onset | 23.48 ± 1.85 | 23.35 ± 1.20 | 23.41 ± 1.52 | 0.759 | 2 (3.6%) |
| M Sleep Midpoint | 27.16 ± 1.32 | 26.89 ± 0.73 | 27.01 ± 1.04 | 0.368 | 2 (3.6%) |
| M Midpoint Weekend | 27.38 ± 1.62 | 26.89 ± 1.24 | 27.10 ± 1.42 | 0.256 | 6 (10.7%) |
# -----------------------------
# Step 0: Define diet variables
# -----------------------------
diet_vars <- c(
"diet1..How.many.times.a.week.did.you.eat.fast.foo", # Fast food
"diet2..How.many.servings.of.fruit.did.you.eat.eac", # Fruit
"diet3..How.many.servings.of.vegetables.did.you.ea", # Vegetables
"diet4..How.many.regular.sodas.or.glasses.of.sweet", # Soda
"diet5..How.many.times.a.week.did.you.eat.beans..l", # Beans/Chicken/Fish
"diet6..How.many.times.a.week.did.you.eat.regular.", # Salty Snacks
"diet7..How.many.times.a.week.did.you.eat.desserts", # Desserts
"diet8..How.much.margarine..butter..or.meat.fat.do", # Butter & Meat
"diet9..How.many.servings.of.fruit.juice.did.you.d" # Fruit Juice
)
# -----------------------------
# Step 1: Identify unhealthy items
# -----------------------------
unhealthy_vars <- c(
"diet1..How.many.times.a.week.did.you.eat.fast.foo",
"diet4..How.many.regular.sodas.or.glasses.of.sweet",
"diet6..How.many.times.a.week.did.you.eat.regular.",
"diet7..How.many.times.a.week.did.you.eat.desserts",
"diet8..How.much.margarine..butter..or.meat.fat.do",
"diet9..How.many.servings.of.fruit.juice.did.you.d"
)
healthy_vars <- setdiff(diet_vars, unhealthy_vars)
# -----------------------------
# Step 2: Reverse code unhealthy items (0,1,2 → 2,1,0)
# -----------------------------
df_test[unhealthy_vars] <- 2 - df_test[unhealthy_vars]
# -----------------------------
# Step 3: Compute total Diet Score
# -----------------------------
df_test$diet_score <- rowSums(df_test[diet_vars], na.rm = TRUE)
df_test$diet_unhealthy_score <- rowSums(df_test[unhealthy_vars], na.rm = TRUE)
# Optional: scale 0–100
max_score <- length(diet_vars) * 2
df_test$diet_score_0_100 <- df_test$diet_score / max_score * 100
# -----------------------------
# Step 4: Check results
# -----------------------------
#head(df_test[, c(diet_vars, "diet_score", "diet_score_0_100")])
vars_to_plot2 <- c(
"diet1..How.many.times.a.week.did.you.eat.fast.foo",
"diet3..How.many.servings.of.vegetables.did.you.ea",
"diet2..How.many.servings.of.fruit.did.you.eat.eac",
"diet_score_0_100",
"diet4..How.many.regular.sodas.or.glasses.of.sweet",
"diet5..How.many.times.a.week.did.you.eat.beans..l",
"diet7..How.many.times.a.week.did.you.eat.desserts",
"diet8..How.much.margarine..butter..or.meat.fat.do",
"diet9..How.many.servings.of.fruit.juice.did.you.d",
"diet6..How.many.times.a.week.did.you.eat.regular.",
"dmlpor..When.reflecting.on.your.typical.eating.ha",
"dmlfrveg..In.a.typical.day..how.many.servings.of.",
"dmlgrain..In.a.typical.day..how.often.do.you.cons",
"dmlsugar..When.reflecting.on.your.typical.eating.")
# Keep only those that actually exist
vars_to_plot2 <- vars_to_plot2[vars_to_plot2 %in% names(df_test)]
short_names2 <- c(
"diet_score_0_100" = "Overall Diet Score",
"diet1..How.many.times.a.week.did.you.eat.fast.foo" = "Weekly Fastfood",
"diet3..How.many.servings.of.vegetables.did.you.ea" = "Weekly Vegetables",
"diet2..How.many.servings.of.fruit.did.you.eat.eac" = "Weekly Fruit",
"diet4..How.many.regular.sodas.or.glasses.of.sweet" = "Weekly Soda",
"diet5..How.many.times.a.week.did.you.eat.beans..l"= "Weekly Beans",
"diet7..How.many.times.a.week.did.you.eat.desserts" = "Weekly Desserts",
"diet8..How.much.margarine..butter..or.meat.fat.do" = "Weekly Butter & Meat",
"diet9..How.many.servings.of.fruit.juice.did.you.d" = "Weekly Fruit juice",
"diet6..How.many.times.a.week.did.you.eat.regular." = "Regular Salty Snacks per Week",
"dmlpor..When.reflecting.on.your.typical.eating.ha" = "Typical Portion",
"dmlfrveg..In.a.typical.day..how.many.servings.of." = "Fruit & Veg Intake",
"dmlgrain..In.a.typical.day..how.often.do.you.cons" = "Typical Grain Intake",
"dmlsugar..When.reflecting.on.your.typical.eating." = "Typical Sugar Intake")
short_names2 <- short_names2[names(short_names2) %in% vars_to_plot2]
plot_list2 <- lapply(
vars_to_plot2,
plot_ir_variable, # <- same function you already created
data = df_test,
short_names = short_names2,
fill_colors = fill_colors,
edge_colors = edge_colors
)
plot_list2 <- plot_list2[!sapply(plot_list2, is.null)]
big_panel2 <- wrap_plots(plot_list2, ncol = 6)
big_panel2num_vars2 <- vars_to_plot2[sapply(df_test[vars_to_plot2], is.numeric)]
cat_vars2 <- vars_to_plot2[sapply(df_test[vars_to_plot2], function(x) !is.numeric(x))]
num_summary2 <- lapply(num_vars2, function(var) {
data <- df_test %>% filter(!is.na(.data[[var]]))
IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
Total_vals <- data[[var]]
t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
p_raw <- if (!is.null(t_res)) t_res$p.value else NA
p_val <- if (!is.na(p_raw)) {
if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
} else "–"
tibble(
Variable = short_names2[[var]],
IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
Total = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
`p-value` = p_val
)
}) %>% bind_rows()
cat_summary2 <- lapply(cat_vars2, function(var) {
tbl <- table(df_test[[var]], df_test$IR_label)
IR_total <- sum(tbl[, "IR"])
NonIR_total <- sum(tbl[, "Non-IR"])
total_total <- sum(tbl)
chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
p_val <- if (!is.na(p_raw)) {
if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
} else "–"
IR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2], 100*x[2]/IR_total))
NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1], 100*x[1]/NonIR_total))
Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
tibble(
Variable = short_names2[[var]],
IR = paste(IR_str, collapse = "; "),
Non_IR = paste(NonIR_str, collapse = "; "),
Total = paste(Total_str, collapse = "; "),
`p-value` = p_val
)
}) %>% bind_rows()
final_tableB2 <- bind_rows(num_summary2, cat_summary2)
final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)
description_rowB <- tibble(
Variable = "Note:",
IR = "IR = mean ± SD or count (%)",
Non_IR = "Non-IR = mean ± SD or count (%)",
Total = "Total = mean ± SD or count (%)",
`p-value` = "t-test or χ² test"
)
final_tableB2 <- bind_rows(description_rowB, final_tableB2)
final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)
p_char2 <- gsub("[^0-9\\.eE+-]", "", final_tableB2$`p-value`)
p_numeric2 <- as.numeric(gsub("<", "-", p_char2))
sig_rows2 <- which(!is.na(p_numeric2) & p_numeric2 < 0.05) + 1
# Compute missing counts per variable
# ----------------------------
# Compute missing counts for the variables actually plotted
missing_summary <- sapply(vars_to_plot2, function(var) {
n_missing <- sum(is.na(df_test[[var]]))
pct_missing <- round(100 * n_missing / nrow(df_test), 1)
paste0(n_missing, " (", pct_missing, "%)")
})
# Add description for the first row
missing_col <- c("Number of missing values per variable", missing_summary)
# Extend final_tableB2 to have the correct length
final_tableB2$`Missing (n, %)` <- missing_col
# Render the table
kbl(final_tableB2, "html",
caption = "Study 1. Diet variables by IR Status",
escape = FALSE) %>%
kable_styling(full_width = TRUE, position = "center") %>%
row_spec(0, bold = TRUE) %>%
row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
column_spec(1, bold = TRUE) %>%
column_spec(5, italic = TRUE) %>%
row_spec(sig_rows2, background = "#F5FBFF")| Variable | IR | Non_IR | Total | p-value | Missing (n, %) |
|---|---|---|---|---|---|
| Note: | IR = mean ± SD or count (%) | Non-IR = mean ± SD or count (%) | Total = mean ± SD or count (%) | t-test or χ² test | Number of missing values per variable |
| Weekly Fastfood | 1.22 ± 0.75 | 1.30 ± 0.67 | 1.27 ± 0.70 | 0.551 | 10 (7.2%) |
| Weekly Vegetables | 1.47 ± 0.71 | 1.40 ± 0.65 | 1.43 ± 0.67 | 0.588 | 10 (7.2%) |
| Weekly Fruit | 1.74 ± 0.52 | 1.57 ± 0.61 | 1.64 ± 0.57 | 0.089 | 12 (8.7%) |
| Overall Diet Score | 66.04 ± 21.10 | 60.82 ± 22.44 | 63.16 ± 21.93 | 0.162 | 0 (0%) |
| Weekly Soda | 1.62 ± 0.64 | 1.66 ± 0.63 | 1.64 ± 0.64 | 0.749 | 10 (7.2%) |
| Weekly Beans | 0.66 ± 0.74 | 0.51 ± 0.65 | 0.58 ± 0.69 | 0.260 | 10 (7.2%) |
| Weekly Desserts | 1.45 ± 0.65 | 1.21 ± 0.72 | 1.32 ± 0.70 | 0.056 | 10 (7.2%) |
| Weekly Butter & Meat | 1.21 ± 0.59 | 1.26 ± 0.61 | 1.23 ± 0.60 | 0.635 | 10 (7.2%) |
| Weekly Fruit juice | 1.83 ± 0.46 | 1.75 ± 0.47 | 1.79 ± 0.47 | 0.374 | 11 (8%) |
| Regular Salty Snacks per Week | 1.55 ± 0.63 | 1.27 ± 0.70 | 1.40 ± 0.68 | 0.018 * | 10 (7.2%) |
| Typical Portion | 6.67 ± 3.01 | 6.64 ± 2.87 | 6.65 ± 2.92 | 0.966 | 2 (1.4%) |
| Fruit & Veg Intake | 5.40 ± 3.04 | 5.72 ± 3.02 | 5.58 ± 3.02 | 0.538 | 0 (0%) |
| Typical Grain Intake | 5.65 ± 4.20 | 5.53 ± 3.88 | 5.58 ± 4.01 | 0.864 | 0 (0%) |
| Typical Sugar Intake | 4.44 ± 3.15 | 4.41 ± 3.36 | 4.42 ± 3.26 | 0.960 | 0 (0%) |
# -----------------------------
# Step 0: Define diet variables
# -----------------------------
diet_vars <- c(
"diet1..How.many.times.a.week.did.you.eat.fast.foo", # Fast food
"diet2..How.many.servings.of.fruit.did.you.eat.eac", # Fruit
"diet3..How.many.servings.of.vegetables.did.you.ea", # Vegetables
"diet4..How.many.regular.sodas.or.glasses.of.sweet", # Soda
"diet5..How.many.times.a.week.did.you.eat.beans..l", # Beans/Chicken/Fish
"diet6..How.many.times.a.week.did.you.eat.regular.", # Salty Snacks
"diet7..How.many.times.a.week.did.you.eat.desserts", # Desserts
"diet8..How.much.margarine..butter..or.meat.fat.do", # Butter & Meat
"diet9..How.many.servings.of.fruit.juice.did.you.d") # Fruit Juice )
# -----------------------------
# Step 1: Identify unhealthy items
# -----------------------------
unhealthy_vars <- c(
"diet1..How.many.times.a.week.did.you.eat.fast.foo",
"diet4..How.many.regular.sodas.or.glasses.of.sweet",
"diet6..How.many.times.a.week.did.you.eat.regular.",
"diet7..How.many.times.a.week.did.you.eat.desserts",
"diet8..How.much.margarine..butter..or.meat.fat.do",
"diet9..How.many.servings.of.fruit.juice.did.you.d")
healthy_vars <- setdiff(diet_vars, unhealthy_vars)
# -----------------------------
# Step 2: Reverse code unhealthy items (0,1,2 → 2,1,0)
# -----------------------------
df_s2[unhealthy_vars] <- 2 - df_s2[unhealthy_vars]
# -----------------------------
# Step 3: Compute total Diet Score
# -----------------------------
df_s2$diet_score <- rowSums(df_s2[diet_vars], na.rm = TRUE)
df_s2$diet_unhealthy_score <- rowSums(df_s2[unhealthy_vars], na.rm = TRUE)
# Optional: scale 0–100
max_score <- length(diet_vars) * 2
df_s2$diet_score_0_100 <- df_s2$diet_score / max_score * 100
# -----------------------------
# Step 4: Variables to plot
# -----------------------------
vars_to_plot2 <- c(
"diet1..How.many.times.a.week.did.you.eat.fast.foo",
"diet3..How.many.servings.of.vegetables.did.you.ea",
"diet2..How.many.servings.of.fruit.did.you.eat.eac",
"diet_score_0_100",
"diet4..How.many.regular.sodas.or.glasses.of.sweet",
"diet5..How.many.times.a.week.did.you.eat.beans..l",
"diet7..How.many.times.a.week.did.you.eat.desserts",
"diet8..How.much.margarine..butter..or.meat.fat.do",
"diet9..How.many.servings.of.fruit.juice.did.you.d",
"diet6..How.many.times.a.week.did.you.eat.regular.",
"dmlpor..When.reflecting.on.your.typical.eating.ha",
"dmlfrveg..In.a.typical.day..how.many.servings.of.",
"dmlgrain..In.a.typical.day..how.often.do.you.cons",
"dmlsugar..When.reflecting.on.your.typical.eating."
)
# Keep only variables that exist
vars_to_plot2 <- vars_to_plot2[vars_to_plot2 %in% names(df_s2)]
short_names2 <- c(
"diet_score_0_100" = "Overall Diet Score",
"diet1..How.many.times.a.week.did.you.eat.fast.foo" = "Weekly Fastfood",
"diet3..How.many.servings.of.vegetables.did.you.ea" = "Weekly Vegetables",
"diet2..How.many.servings.of.fruit.did.you.eat.eac" = "Weekly Fruit",
"diet4..How.many.regular.sodas.or.glasses.of.sweet" = "Weekly Soda",
"diet5..How.many.times.a.week.did.you.eat.beans..l"= "Weekly Beans",
"diet7..How.many.times.a.week.did.you.eat.desserts" = "Weekly Desserts",
"diet8..How.much.margarine..butter..or.meat.fat.do" = "Weekly Butter & Meat",
"diet9..How.many.servings.of.fruit.juice.did.you.d" = "Weekly Fruit juice",
"diet6..How.many.times.a.week.did.you.eat.regular." = "Regular Salty Snacks per Week",
"dmlpor..When.reflecting.on.your.typical.eating.ha" = "Typical Portion",
"dmlfrveg..In.a.typical.day..how.many.servings.of." = "Fruit & Veg Intake",
"dmlgrain..In.a.typical.day..how.often.do.you.cons" = "Typical Grain Intake",
"dmlsugar..When.reflecting.on.your.typical.eating." = "Typical Sugar Intake"
)
short_names2 <- short_names2[names(short_names2) %in% vars_to_plot2]
# -----------------------------
# Step 5: Generate plots
# -----------------------------
plot_list2 <- lapply(
vars_to_plot2,
plot_ir_variable,
data = df_s2,
short_names = short_names2,
fill_colors = fill_colors,
edge_colors = edge_colors
)
plot_list2 <- plot_list2[!sapply(plot_list2, is.null)]
big_panel2 <- wrap_plots(plot_list2, ncol = 6)
big_panel2# -----------------------------
# Step 6: Summarize numeric vs categorical
# -----------------------------
num_vars2 <- vars_to_plot2[sapply(df_s2[vars_to_plot2], is.numeric)]
cat_vars2 <- vars_to_plot2[sapply(df_s2[vars_to_plot2], function(x) !is.numeric(x))]
# Numeric summaries
num_summary2 <- lapply(num_vars2, function(var) {
data <- df_s2 %>% filter(!is.na(.data[[var]]))
IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
Total_vals <- data[[var]]
t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
p_raw <- if (!is.null(t_res)) t_res$p.value else NA
p_val <- if (!is.na(p_raw)) {
if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
} else "–"
tibble(
Variable = short_names2[[var]],
IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
Total = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
`p-value` = p_val
)
}) %>% bind_rows()
# Categorical summaries
cat_summary2 <- lapply(cat_vars2, function(var) {
tbl <- table(df_s2[[var]], df_s2$IR_label)
IR_total <- sum(tbl[, "IR"])
NonIR_total <- sum(tbl[, "Non-IR"])
total_total <- sum(tbl)
chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
p_val <- if (!is.na(p_raw)) {
if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
} else "–"
IR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2], 100*x[2]/IR_total))
NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1], 100*x[1]/NonIR_total))
Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
tibble(
Variable = short_names2[[var]],
IR = paste(IR_str, collapse = "; "),
Non_IR = paste(NonIR_str, collapse = "; "),
Total = paste(Total_str, collapse = "; "),
`p-value` = p_val
)
}) %>% bind_rows()
final_tableB2 <- bind_rows(num_summary2, cat_summary2)
final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)
# Add description row
description_rowB <- tibble(
Variable = "Note:",
IR = "IR = mean ± SD or count (%)",
Non_IR = "Non-IR = mean ± SD or count (%)",
Total = "Total = mean ± SD or count (%)",
`p-value` = "t-test or χ² test"
)
final_tableB2 <- bind_rows(description_rowB, final_tableB2)
final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)
# Identify significant rows
p_char2 <- gsub("[^0-9\\.eE+-]", "", final_tableB2$`p-value`)
p_numeric2 <- as.numeric(gsub("<", "-", p_char2))
sig_rows2 <- which(!is.na(p_numeric2) & p_numeric2 < 0.05) + 1
# Compute missing counts per variable
missing_summary <- sapply(vars_to_plot2, function(var) {
n_missing <- sum(is.na(df_s2[[var]]))
pct_missing <- round(100 * n_missing / nrow(df_s2), 1)
paste0(n_missing, " (", pct_missing, "%)")
})
missing_col <- c("Number of missing values per variable", missing_summary)
final_tableB2$`Missing (n, %)` <- missing_col
# Render the table
kbl(final_tableB2, "html",
caption = "Study 2. Diet Variables by IR Status",
escape = FALSE) %>%
kable_styling(full_width = TRUE, position = "center") %>%
row_spec(0, bold = TRUE) %>%
row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
column_spec(1, bold = TRUE) %>%
column_spec(5, italic = TRUE) %>%
row_spec(sig_rows2, background = "#F5FBFF")| Variable | IR | Non_IR | Total | p-value | Missing (n, %) |
|---|---|---|---|---|---|
| Note: | IR = mean ± SD or count (%) | Non-IR = mean ± SD or count (%) | Total = mean ± SD or count (%) | t-test or χ² test | Number of missing values per variable |
| Weekly Fastfood | 1.44 ± 0.51 | 1.60 ± 0.58 | 1.52 ± 0.54 | 0.303 | 6 (10.7%) |
| Weekly Vegetables | 1.60 ± 0.50 | 1.40 ± 0.58 | 1.50 ± 0.54 | 0.197 | 6 (10.7%) |
| Weekly Fruit | 1.56 ± 0.58 | 1.75 ± 0.53 | 1.65 ± 0.56 | 0.239 | 7 (12.5%) |
| Overall Diet Score | 66.67 ± 17.43 | 62.59 ± 29.68 | 64.48 ± 24.63 | 0.528 | 0 (0%) |
| Weekly Soda | 1.68 ± 0.56 | 1.88 ± 0.33 | 1.78 ± 0.46 | 0.131 | 6 (10.7%) |
| Weekly Beans | 0.68 ± 0.69 | 0.40 ± 0.58 | 0.54 ± 0.65 | 0.127 | 6 (10.7%) |
| Weekly Desserts | 1.20 ± 0.76 | 1.52 ± 0.65 | 1.36 ± 0.72 | 0.118 | 6 (10.7%) |
| Weekly Butter & Meat | 1.32 ± 0.48 | 1.48 ± 0.59 | 1.40 ± 0.53 | 0.295 | 6 (10.7%) |
| Weekly Fruit juice | 1.52 ± 0.65 | 1.88 ± 0.33 | 1.70 ± 0.54 | 0.019 * | 6 (10.7%) |
| Regular Salty Snacks per Week | 1.48 ± 0.65 | 1.68 ± 0.48 | 1.58 ± 0.57 | 0.223 | 6 (10.7%) |
| Typical Portion | 6.15 ± 2.94 | 7.33 ± 3.14 | 6.79 ± 3.08 | 0.153 | 0 (0%) |
| Fruit & Veg Intake | 5.77 ± 2.32 | 5.52 ± 2.79 | 5.64 ± 2.56 | 0.716 | 1 (1.8%) |
| Typical Grain Intake | 7.31 ± 3.80 | 7.17 ± 3.13 | 7.23 ± 3.43 | 0.881 | 0 (0%) |
| Typical Sugar Intake | 3.65 ± 3.02 | 5.17 ± 3.07 | 4.46 ± 3.12 | 0.069 | 0 (0%) |
# Keep only those that actually exist
vars_to_plot2 <- vars_to_plot2[vars_to_plot2 %in% names(df_test)]
df_test$hrv_rmssd_clean <- pmin(
pmax(df_test$hrv_rmssd, 5), # lower bound
100 # upper bound
)
num_changed_rmssd <- sum(df_test$hrv_rmssd_clean != df_test$hrv_rmssd, na.rm = TRUE)
vars_to_plot2 <- c(
"hr_from_ecg",
"hrv_rmssd_clean",
"stress_overall_daily_mean_stress")
short_names2 <- c(
"hr_from_ecg" = "Resting HR (ECG)",
"hrv_rmssd_clean" = "HRV RMSSD (ECG)",
"stress_overall_daily_mean_stress" = "Daily Stress score (Garmin)")
short_names2 <- short_names2[names(short_names2) %in% vars_to_plot2]
plot_list2 <- lapply(
vars_to_plot2,
plot_ir_variable, # <- same function you already created
data = df_test,
short_names = short_names2,
fill_colors = fill_colors,
edge_colors = edge_colors
)
plot_list2 <- plot_list2[!sapply(plot_list2, is.null)]
big_panel2 <- wrap_plots(plot_list2, ncol = 4)
big_panel2num_vars2 <- vars_to_plot2[sapply(df_test[vars_to_plot2], is.numeric)]
cat_vars2 <- vars_to_plot2[sapply(df_test[vars_to_plot2], function(x) !is.numeric(x))]
num_summary2 <- lapply(num_vars2, function(var) {
data <- df_test %>% filter(!is.na(.data[[var]]))
IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
Total_vals <- data[[var]]
t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
p_raw <- if (!is.null(t_res)) t_res$p.value else NA
p_val <- if (!is.na(p_raw)) {
if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
} else "–"
tibble(
Variable = short_names2[[var]],
IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
Total = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
`p-value` = p_val
)
}) %>% bind_rows()
cat_summary2 <- lapply(cat_vars2, function(var) {
tbl <- table(df_test[[var]], df_test$IR_label)
IR_total <- sum(tbl[, "IR"])
NonIR_total <- sum(tbl[, "Non-IR"])
total_total <- sum(tbl)
chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
p_val <- if (!is.na(p_raw)) {
if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
} else "–"
IR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2], 100*x[2]/IR_total))
NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1], 100*x[1]/NonIR_total))
Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
tibble(
Variable = short_names2[[var]],
IR = paste(IR_str, collapse = "; "),
Non_IR = paste(NonIR_str, collapse = "; "),
Total = paste(Total_str, collapse = "; "),
`p-value` = p_val
)
}) %>% bind_rows()
final_tableB2 <- bind_rows(num_summary2, cat_summary2)
final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)
description_rowB <- tibble(
Variable = "Note:",
IR = "IR = mean ± SD or count (%)",
Non_IR = "Non-IR = mean ± SD or count (%)",
Total = "Total = mean ± SD or count (%)",
`p-value` = "t-test or χ² test"
)
final_tableB2 <- bind_rows(description_rowB, final_tableB2)
final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)
p_char2 <- gsub("[^0-9\\.eE+-]", "", final_tableB2$`p-value`)
p_numeric2 <- as.numeric(gsub("<", "-", p_char2))
sig_rows2 <- which(!is.na(p_numeric2) & p_numeric2 < 0.05) + 1
# Compute missing counts per variable
# ----------------------------
# Compute missing counts for the variables actually plotted
missing_summary <- sapply(vars_to_plot2, function(var) {
n_missing <- sum(is.na(df_test[[var]]))
pct_missing <- round(100 * n_missing / nrow(df_test), 1)
paste0(n_missing, " (", pct_missing, "%)")
})
# Add description for the first row
missing_col <- c("Number of missing values per variable", missing_summary)
# Extend final_tableB2 to have the correct length
final_tableB2$`Missing (n, %)` <- missing_col
# Render the table
kbl(final_tableB2, "html",
caption = "Study 1. HR/HRV by IR Status",
escape = FALSE) %>%
kable_styling(full_width = TRUE, position = "center") %>%
row_spec(0, bold = TRUE) %>%
row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
column_spec(1, bold = TRUE) %>%
column_spec(5, italic = TRUE) %>%
row_spec(sig_rows2, background = "#F5FBFF")| Variable | IR | Non_IR | Total | p-value | Missing (n, %) |
|---|---|---|---|---|---|
| Note: | IR = mean ± SD or count (%) | Non-IR = mean ± SD or count (%) | Total = mean ± SD or count (%) | t-test or χ² test | Number of missing values per variable |
| Resting HR (ECG) | 69.03 ± 11.47 | 64.99 ± 11.84 | 66.83 ± 11.80 | 0.049 * | 6 (4.3%) |
| HRV RMSSD (ECG) | 32.56 ± 26.10 | 35.84 ± 26.35 | 34.35 ± 26.19 | 0.476 | 6 (4.3%) |
| Daily Stress score (Garmin) | 27.76 ± 13.97 | 24.34 ± 13.48 | 25.88 ± 13.76 | 0.149 | 0 (0%) |
# Keep only those that actually exist
vars_to_plot2 <- vars_to_plot2[vars_to_plot2 %in% names(df_s2)]
df_s2$hrv_rmssd_clean <- pmin(
pmax(df_s2$hrv_rmssd, 5), # lower bound
100 # upper bound
)
num_changed_rmssd <- sum(df_s2$hrv_rmssd_clean != df_s2$hrv_rmssd, na.rm = TRUE)
vars_to_plot2 <- c(
"hr_from_ecg",
"hrv_rmssd_clean",
"stress_overall_daily_mean_stress"
)
short_names2 <- c(
"hr_from_ecg" = "Resting HR (ECG)",
"hrv_rmssd_clean" = "HRV RMSSD (ECG)",
"stress_overall_daily_mean_stress" = "Daily Stress score (Garmin)"
)
short_names2 <- short_names2[names(short_names2) %in% vars_to_plot2]
plot_list2 <- lapply(
vars_to_plot2,
plot_ir_variable, # <- same function you already created
data = df_s2,
short_names = short_names2,
fill_colors = fill_colors,
edge_colors = edge_colors
)
plot_list2 <- plot_list2[!sapply(plot_list2, is.null)]
big_panel2 <- wrap_plots(plot_list2, ncol = 4)
big_panel2num_vars2 <- vars_to_plot2[sapply(df_s2[vars_to_plot2], is.numeric)]
cat_vars2 <- vars_to_plot2[sapply(df_s2[vars_to_plot2], function(x) !is.numeric(x))]
num_summary2 <- lapply(num_vars2, function(var) {
data <- df_s2 %>% filter(!is.na(.data[[var]]))
IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
Total_vals <- data[[var]]
t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
p_raw <- if (!is.null(t_res)) t_res$p.value else NA
p_val <- if (!is.na(p_raw)) {
if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
} else "–"
tibble(
Variable = short_names2[[var]],
IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
Total = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
`p-value` = p_val
)
}) %>% bind_rows()
cat_summary2 <- lapply(cat_vars2, function(var) {
tbl <- table(df_s2[[var]], df_s2$IR_label)
IR_total <- sum(tbl[, "IR"])
NonIR_total <- sum(tbl[, "Non-IR"])
total_total <- sum(tbl)
chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
p_val <- if (!is.na(p_raw)) {
if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
} else "–"
IR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2], 100*x[2]/IR_total))
NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1], 100*x[1]/NonIR_total))
Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
tibble(
Variable = short_names2[[var]],
IR = paste(IR_str, collapse = "; "),
Non_IR = paste(NonIR_str, collapse = "; "),
Total = paste(Total_str, collapse = "; "),
`p-value` = p_val
)
}) %>% bind_rows()
final_tableB2 <- bind_rows(num_summary2, cat_summary2)
final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)
description_rowB <- tibble(
Variable = "Note:",
IR = "IR = mean ± SD or count (%)",
Non_IR = "Non-IR = mean ± SD or count (%)",
Total = "Total = mean ± SD or count (%)",
`p-value` = "t-test or χ² test"
)
final_tableB2 <- bind_rows(description_rowB, final_tableB2)
final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)
p_char2 <- gsub("[^0-9\\.eE+-]", "", final_tableB2$`p-value`)
p_numeric2 <- as.numeric(gsub("<", "-", p_char2))
sig_rows2 <- which(!is.na(p_numeric2) & p_numeric2 < 0.05) + 1
# Compute missing counts per variable
missing_summary <- sapply(vars_to_plot2, function(var) {
n_missing <- sum(is.na(df_s2[[var]]))
pct_missing <- round(100 * n_missing / nrow(df_s2), 1)
paste0(n_missing, " (", pct_missing, "%)")
})
missing_col <- c("Number of missing values per variable", missing_summary)
final_tableB2$`Missing (n, %)` <- missing_col
# Render the table
kbl(final_tableB2, "html",
caption = "Study 2. HR/HRV by IR Status",
escape = FALSE) %>%
kable_styling(full_width = TRUE, position = "center") %>%
row_spec(0, bold = TRUE) %>%
row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
column_spec(1, bold = TRUE) %>%
column_spec(5, italic = TRUE) %>%
row_spec(sig_rows2, background = "#F5FBFF")| Variable | IR | Non_IR | Total | p-value | Missing (n, %) |
|---|---|---|---|---|---|
| Note: | IR = mean ± SD or count (%) | Non-IR = mean ± SD or count (%) | Total = mean ± SD or count (%) | t-test or χ² test | Number of missing values per variable |
| Resting HR (ECG) | 64.38 ± 11.94 | 56.83 ± 6.07 | 60.40 ± 9.98 | 0.006 ** | 1 (1.8%) |
| HRV RMSSD (ECG) | 33.94 ± 31.56 | 30.25 ± 23.12 | 32.00 ± 27.24 | 0.626 | 1 (1.8%) |
| Daily Stress score (Garmin) | 19.44 ± 13.86 | 22.33 ± 11.69 | 20.96 ± 12.73 | 0.410 | 1 (1.8%) |
library(tidyverse)
library(knitr)
library(kableExtra)
# -------------------------------
# 1. Define subset of features
# -------------------------------
X <- df_test %>%
dplyr::select(
fh_dm2sb_cat,
years_of_education,
age,
bmi_vsorres..BMI,
whr_vsorres..Waist.to.Hip.Ratio..WHR.,
pulse_vsorres..Heart.Rate..bpm.,
sualcage,
sualckncf,
sri,
mean_pct_steps_morning,
mean_pct_steps_evening,
dmlhex..How.often.would.you.say.that.you.engage.i,
dmlvex..In.a.typical.week..how.often.do.you.engag,
hr_from_ecg,
mean_active_minutes,
diet6..How.many.times.a.week.did.you.eat.regular.,
diet1..How.many.times.a.week.did.you.eat.fast.foo,
dmlgrain..In.a.typical.day..how.often.do.you.cons,
dmlsugar..When.reflecting.on.your.typical.eating.,
cv_total_steps,
stress_overall_daily_mean_stress
) %>%
mutate(across(where(is.factor), ~ factor(trimws(.))))
selected_features <- colnames(X)
# -------------------------------
# 2. Feature categories
# -------------------------------
feature_categories <- tibble(
Feature = selected_features,
Category = c(
"Demographics", # fh_dm2sb_cat
"Demographics", # years_of_education
"Demographics", # age
"Anthropometrics & Vitals", # bmi_vsorres..BMI
"Anthropometrics & Vitals", # whr_vsorres..Waist.to.Hip.Ratio..WHR.
"Anthropometrics & Vitals", # pulse_vsorres..Heart.Rate..bpm.
"Alcohol / Substance Use", # sualcage
"Alcohol / Substance Use", # sualckncf
"Sleep", # sri
"Physical Activity", # mean_pct_steps_morning
"Physical Activity", # mean_pct_steps_evening
"Physical Activity", # dmlhex..How.often.would.you.say.that.you.engage.i
"Physical Activity", # dmlvex..In.a.typical.week..how.often.do.you.engag
"ECG / Heart Rate & HRV", # hr_from_ecg
"Physical Activity", # mean_active_minutes
"Nutrition", # diet6..How.many.times.a.week.did.you.eat.regular.
"Nutrition", # diet1..How.many.times.a.week.did.you.eat.fast.foo
"Nutrition", # dmlgrain..In.a.typical.day..how.often.do.you.cons
"Nutrition", # dmlsugar..When.reflecting.on.your.typical.eating.
"Physical Activity", # cv_total_steps
"Stress" # stress_overall_daily_mean_stress
)
)
# -------------------------------
# 3. Short names dictionary
# -------------------------------
short_names_all <- c(
"bmi_vsorres..BMI" = "BMI",
"whr_vsorres..Waist.to.Hip.Ratio..WHR." = "WaistHip_Ratio",
"pulse_vsorres..Heart.Rate..bpm." = "PulseHR",
"age" = "Age",
"fh_dm2sb_cat" = "Sibling T2D history",
"years_of_education" = "Years of education",
"sualcage" = "Total years drinking",
"sualckncf" = "Wine in past year",
"sri" = "Sleep Reg. Index",
"mean_pct_steps_morning" = "Pct Steps Morning",
"mean_pct_steps_evening" = "Pct Steps Evening",
"dmlhex..How.often.would.you.say.that.you.engage.i" = "Home Exercise Frequency",
"dmlvex..In.a.typical.week..how.often.do.you.engag" = "Vig Exercise Frequency",
"hr_from_ecg" = "Resting HR (ECG)",
"mean_active_minutes" = "Mean Active Minutes",
"diet6..How.many.times.a.week.did.you.eat.regular." = "Regular Salty Snacks per Week",
"diet1..How.many.times.a.week.did.you.eat.fast.foo" = "Weekly Fastfood",
"dmlgrain..In.a.typical.day..how.often.do.you.cons" = "Typical Grain Intake",
"dmlsugar..When.reflecting.on.your.typical.eating." = "Typical Sugar Intake",
"cv_total_steps" = "CV Total Steps",
"stress_overall_daily_mean_stress" = "Daily Stress score (Garmin)"
)
short_names_tbl <- tibble(
Feature = names(short_names_all),
Short_Name = unname(short_names_all)
)
# -------------------------------
# 4. Combine category + name (2 columns)
# -------------------------------
final_table <- feature_categories %>%
left_join(short_names_tbl, by = "Feature") %>%
mutate(`Feature / Short Name` = coalesce(Short_Name, Feature)) %>%
select(Category, `Feature / Short Name`) %>%
arrange(Category, `Feature / Short Name`)
# -------------------------------
# 5. Render 2-column table
# -------------------------------
kable(
final_table,
caption = "Selected Features by Category",
col.names = c("Category", "Feature / Short Name")
) %>%
kable_classic(full_width = TRUE)| Category | Feature / Short Name |
|---|---|
| Alcohol / Substance Use | Total years drinking |
| Alcohol / Substance Use | Wine in past year |
| Anthropometrics & Vitals | BMI |
| Anthropometrics & Vitals | PulseHR |
| Anthropometrics & Vitals | WaistHip_Ratio |
| Demographics | Age |
| Demographics | Sibling T2D history |
| Demographics | Years of education |
| ECG / Heart Rate & HRV | Resting HR (ECG) |
| Nutrition | Regular Salty Snacks per Week |
| Nutrition | Typical Grain Intake |
| Nutrition | Typical Sugar Intake |
| Nutrition | Weekly Fastfood |
| Physical Activity | CV Total Steps |
| Physical Activity | Home Exercise Frequency |
| Physical Activity | Mean Active Minutes |
| Physical Activity | Pct Steps Evening |
| Physical Activity | Pct Steps Morning |
| Physical Activity | Vig Exercise Frequency |
| Sleep | Sleep Reg. Index |
| Stress | Daily Stress score (Garmin) |
library(ggplot2)
library(viridis)
# Create feature counts table
feature_counts <- feature_categories %>%
group_by(Category) %>%
summarise(Count = n()) %>%
ungroup() %>%
mutate(Percent = round(100 * Count / sum(Count), 1))
# Wide horizontal stacked bar plot
ggplot(feature_counts, aes(x = 1, y = Count, fill = Category)) +
geom_bar(stat = "identity", width = 0.6, color = "white") +
geom_text(aes(label = paste0(Count, " (", Percent, "%)")),
position = position_stack(vjust = 0.5), color = "black", size = 5) +
labs(title = "Feature Counts by Category", x = NULL, y = "Number of Features") +
coord_flip(expand = FALSE) +
scale_fill_viridis(discrete = TRUE, option = "C") + # modern, contrasting colors
theme_minimal(base_size = 12) +
theme(
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title.y = element_blank(),
plot.title = element_text(face = "bold", hjust = 0.5),
legend.position = "right"
) +
# Make plot very wide and not tall
theme(aspect.ratio = 0.25)# Select your variables
df_selected <- df_test %>%
dplyr::select(
years_of_education,
age,
bmi_vsorres..BMI,
whr_vsorres..Waist.to.Hip.Ratio..WHR.,
pulse_vsorres..Heart.Rate..bpm.,
sualcage,
sualckncf,
sri,
mean_pct_steps_morning,
mean_pct_steps_afternoon,
mean_pct_steps_evening,
dmlhex..How.often.would.you.say.that.you.engage.i,
dmlact..What.would.you.consider.your.typical.acti,
dmlvex..In.a.typical.week..how.often.do.you.engag,
hr_from_ecg,
diet7..How.many.times.a.week.did.you.eat.desserts,
diet6..How.many.times.a.week.did.you.eat.regular.,
dmlpor..When.reflecting.on.your.typical.eating.ha,
dmlfrveg..In.a.typical.day..how.many.servings.of.,
dmlgrain..In.a.typical.day..how.often.do.you.cons,
dmlsugar..When.reflecting.on.your.typical.eating.,
sd_wake_clock_h,
sd_onset_clock_h,
cv_total_steps,
stress_overall_daily_mean_stress
)
feature_categories <- tidyr::tibble(
Feature = colnames(df_selected),
Category = c(
"Demographics", "Demographics", # years_of_education, age
"Anthropometrics & Vitals", "Anthropometrics & Vitals", "Anthropometrics & Vitals", # BMI, WHR, Pulse
"Alcohol / Substance Use", "Alcohol / Substance Use", # sualcage, sualckncf
"Stress", # sri
"ECG / Heart Rate & HRV", "ECG / Heart Rate & HRV",
"Physical Activity", "Physical Activity", "Physical Activity", "Physical Activity", "Physical Activity", "Physical Activity", "Physical Activity", # steps and activity features
"Sleep", "Sleep", # sd_wake_clock_h, sd_onset_clock_h
"Nutrition", "Nutrition", "Nutrition", "Nutrition", "Nutrition", "Nutrition" # diet features
)
)
# ------------------------------
# 2. Count features per category
# ------------------------------
feature_counts <- feature_categories %>%
group_by(Category) %>%
summarise(Count = n(), .groups = "drop") %>%
mutate(Percent = round(Count / sum(Count) * 100, 1))
# ------------------------------
# 3. Stacked bar plot
# ------------------------------
ggplot(feature_counts, aes(x = "", y = Count, fill = Category)) +
geom_bar(stat = "identity", width = 0.7) + # thicker bars
geom_text(aes(label = paste0(Count, " (", Percent, "%)")),
position = position_stack(vjust = 0.5), color = "black", size = 5) +
labs(title = "Feature Counts by Category", x = "", y = "Number of Features") +
coord_flip(expand = FALSE) + # remove extra padding for wider look
scale_fill_brewer(palette = "Blues") +
theme_classic(base_size = 10) +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank())# Keep only numeric columns
df_numeric <- df_selected %>%
select(where(is.numeric))
# Compute correlation matrix
cor_matrix <- cor(df_numeric, use = "pairwise.complete.obs")
# 1. Compute correlation matrix
cor_matrix <- cor(df_numeric, use = "pairwise.complete.obs")
cor_matrix[is.na(cor_matrix)] <- 0 # handle zero-variance columns
# 2. Convert to long table
cor_table <- as.data.frame(as.table(cor_matrix)) %>%
filter(Var1 != Var2) # remove self-correlations
# 3. Remove duplicate pairs (A,B) == (B,A)
cor_table_unique <- cor_table %>%
rowwise() %>%
mutate(pair = paste(sort(c(Var1, Var2)), collapse = "_")) %>%
distinct(pair, .keep_all = TRUE) %>%
ungroup() %>%
dplyr::select(Var1, Var2, Freq)
# 4. Rename Freq to Correlation
cor_table_unique <- cor_table_unique %>%
dplyr::rename(Correlation = Freq)
# 5. Sort descending by absolute correlation
cor_table_unique <- cor_table_unique %>%
dplyr::arrange(desc(abs(Correlation)))
# 6. Print table with kable
knitr::kable(cor_table_unique, digits = 2, caption = "All Correlations (Sorted by |Correlation|)")| Var1 | Var2 | Correlation |
|---|---|---|
| hr_from_ecg | pulse_vsorres..Heart.Rate..bpm. | 0.75 |
| cv_total_steps | mean_pct_steps_afternoon | -0.63 |
| dmlsugar..When.reflecting.on.your.typical.eating. | diet7..How.many.times.a.week.did.you.eat.desserts | 0.62 |
| cv_total_steps | mean_pct_steps_evening | -0.59 |
| sd_onset_clock_h | sd_wake_clock_h | 0.51 |
| stress_overall_daily_mean_stress | hr_from_ecg | 0.51 |
| dmlvex..In.a.typical.week..how.often.do.you.engag | dmlact..What.would.you.consider.your.typical.acti | 0.50 |
| dmlact..What.would.you.consider.your.typical.acti | dmlhex..How.often.would.you.say.that.you.engage.i | 0.49 |
| cv_total_steps | mean_pct_steps_morning | -0.49 |
| stress_overall_daily_mean_stress | mean_pct_steps_afternoon | 0.48 |
| stress_overall_daily_mean_stress | pulse_vsorres..Heart.Rate..bpm. | 0.46 |
| stress_overall_daily_mean_stress | cv_total_steps | -0.46 |
| mean_pct_steps_afternoon | mean_pct_steps_morning | 0.41 |
| dmlfrveg..In.a.typical.day..how.many.servings.of. | dmlhex..How.often.would.you.say.that.you.engage.i | 0.40 |
| diet6..How.many.times.a.week.did.you.eat.regular. | diet7..How.many.times.a.week.did.you.eat.desserts | 0.38 |
| diet6..How.many.times.a.week.did.you.eat.regular. | dmlvex..In.a.typical.week..how.often.do.you.engag | 0.35 |
| dmlact..What.would.you.consider.your.typical.acti | bmi_vsorres..BMI | -0.34 |
| dmlvex..In.a.typical.week..how.often.do.you.engag | dmlhex..How.often.would.you.say.that.you.engage.i | 0.34 |
| dmlhex..How.often.would.you.say.that.you.engage.i | bmi_vsorres..BMI | -0.33 |
| dmlfrveg..In.a.typical.day..how.many.servings.of. | dmlvex..In.a.typical.week..how.often.do.you.engag | 0.31 |
| sd_wake_clock_h | mean_pct_steps_morning | -0.30 |
| stress_overall_daily_mean_stress | mean_pct_steps_evening | 0.29 |
| stress_overall_daily_mean_stress | mean_pct_steps_morning | 0.29 |
| dmlfrveg..In.a.typical.day..how.many.servings.of. | dmlact..What.would.you.consider.your.typical.acti | 0.29 |
| dmlgrain..In.a.typical.day..how.often.do.you.cons | years_of_education | 0.28 |
| dmlgrain..In.a.typical.day..how.often.do.you.cons | dmlpor..When.reflecting.on.your.typical.eating.ha | 0.27 |
| dmlgrain..In.a.typical.day..how.often.do.you.cons | diet6..How.many.times.a.week.did.you.eat.regular. | 0.27 |
| dmlsugar..When.reflecting.on.your.typical.eating. | diet6..How.many.times.a.week.did.you.eat.regular. | 0.27 |
| sd_wake_clock_h | mean_pct_steps_evening | 0.26 |
| dmlvex..In.a.typical.week..how.often.do.you.engag | bmi_vsorres..BMI | -0.26 |
| dmlgrain..In.a.typical.day..how.often.do.you.cons | dmlfrveg..In.a.typical.day..how.many.servings.of. | 0.26 |
| sd_wake_clock_h | hr_from_ecg | 0.26 |
| dmlsugar..When.reflecting.on.your.typical.eating. | dmlvex..In.a.typical.week..how.often.do.you.engag | 0.26 |
| mean_pct_steps_evening | mean_pct_steps_afternoon | 0.25 |
| mean_pct_steps_evening | age | -0.25 |
| bmi_vsorres..BMI | age | -0.25 |
| dmlfrveg..In.a.typical.day..how.many.servings.of. | dmlpor..When.reflecting.on.your.typical.eating.ha | 0.24 |
| stress_overall_daily_mean_stress | sd_onset_clock_h | 0.24 |
| mean_pct_steps_afternoon | bmi_vsorres..BMI | 0.24 |
| stress_overall_daily_mean_stress | bmi_vsorres..BMI | 0.24 |
| mean_pct_steps_morning | sualcage | 0.24 |
| sd_wake_clock_h | years_of_education | -0.23 |
| dmlsugar..When.reflecting.on.your.typical.eating. | dmlpor..When.reflecting.on.your.typical.eating.ha | 0.23 |
| sd_onset_clock_h | mean_pct_steps_evening | 0.23 |
| stress_overall_daily_mean_stress | dmlact..What.would.you.consider.your.typical.acti | -0.23 |
| sd_onset_clock_h | pulse_vsorres..Heart.Rate..bpm. | 0.23 |
| hr_from_ecg | bmi_vsorres..BMI | 0.22 |
| diet6..How.many.times.a.week.did.you.eat.regular. | dmlhex..How.often.would.you.say.that.you.engage.i | 0.22 |
| pulse_vsorres..Heart.Rate..bpm. | bmi_vsorres..BMI | 0.22 |
| stress_overall_daily_mean_stress | dmlvex..In.a.typical.week..how.often.do.you.engag | -0.22 |
| sd_onset_clock_h | hr_from_ecg | 0.22 |
| dmlfrveg..In.a.typical.day..how.many.servings.of. | years_of_education | 0.22 |
| hr_from_ecg | sualckncf | 0.22 |
| bmi_vsorres..BMI | years_of_education | -0.22 |
| mean_pct_steps_morning | age | 0.21 |
| hr_from_ecg | age | -0.21 |
| dmlsugar..When.reflecting.on.your.typical.eating. | dmlgrain..In.a.typical.day..how.often.do.you.cons | 0.21 |
| hr_from_ecg | dmlact..What.would.you.consider.your.typical.acti | -0.20 |
| dmlgrain..In.a.typical.day..how.often.do.you.cons | dmlhex..How.often.would.you.say.that.you.engage.i | 0.20 |
| sri | age | -0.20 |
| sd_onset_clock_h | sualcage | 0.20 |
| hr_from_ecg | dmlhex..How.often.would.you.say.that.you.engage.i | -0.20 |
| hr_from_ecg | dmlvex..In.a.typical.week..how.often.do.you.engag | -0.19 |
| pulse_vsorres..Heart.Rate..bpm. | age | -0.19 |
| dmlhex..How.often.would.you.say.that.you.engage.i | pulse_vsorres..Heart.Rate..bpm. | -0.19 |
| dmlfrveg..In.a.typical.day..how.many.servings.of. | bmi_vsorres..BMI | -0.19 |
| stress_overall_daily_mean_stress | sualcage | 0.18 |
| dmlact..What.would.you.consider.your.typical.acti | sualckncf | -0.18 |
| dmlgrain..In.a.typical.day..how.often.do.you.cons | dmlact..What.would.you.consider.your.typical.acti | 0.18 |
| cv_total_steps | age | 0.18 |
| dmlvex..In.a.typical.week..how.often.do.you.engag | sri | 0.18 |
| dmlgrain..In.a.typical.day..how.often.do.you.cons | diet7..How.many.times.a.week.did.you.eat.desserts | 0.18 |
| stress_overall_daily_mean_stress | sualckncf | 0.18 |
| dmlact..What.would.you.consider.your.typical.acti | years_of_education | 0.17 |
| diet6..How.many.times.a.week.did.you.eat.regular. | dmlact..What.would.you.consider.your.typical.acti | 0.17 |
| dmlsugar..When.reflecting.on.your.typical.eating. | sri | 0.17 |
| stress_overall_daily_mean_stress | dmlhex..How.often.would.you.say.that.you.engage.i | -0.17 |
| dmlhex..How.often.would.you.say.that.you.engage.i | sualckncf | -0.17 |
| stress_overall_daily_mean_stress | years_of_education | -0.17 |
| cv_total_steps | sd_onset_clock_h | -0.17 |
| age | years_of_education | 0.17 |
| dmlhex..How.often.would.you.say.that.you.engage.i | whr_vsorres..Waist.to.Hip.Ratio..WHR. | -0.16 |
| stress_overall_daily_mean_stress | dmlfrveg..In.a.typical.day..how.many.servings.of. | -0.16 |
| sri | years_of_education | 0.16 |
| dmlpor..When.reflecting.on.your.typical.eating.ha | dmlhex..How.often.would.you.say.that.you.engage.i | 0.16 |
| cv_total_steps | sualcage | -0.16 |
| dmlfrveg..In.a.typical.day..how.many.servings.of. | hr_from_ecg | -0.16 |
| dmlfrveg..In.a.typical.day..how.many.servings.of. | sualckncf | -0.16 |
| diet7..How.many.times.a.week.did.you.eat.desserts | sri | 0.16 |
| dmlfrveg..In.a.typical.day..how.many.servings.of. | whr_vsorres..Waist.to.Hip.Ratio..WHR. | -0.15 |
| dmlvex..In.a.typical.week..how.often.do.you.engag | years_of_education | 0.15 |
| diet7..How.many.times.a.week.did.you.eat.desserts | dmlvex..In.a.typical.week..how.often.do.you.engag | 0.15 |
| sd_onset_clock_h | dmlsugar..When.reflecting.on.your.typical.eating. | -0.15 |
| sd_onset_clock_h | years_of_education | -0.15 |
| sd_onset_clock_h | dmlact..What.would.you.consider.your.typical.acti | -0.15 |
| sd_onset_clock_h | age | -0.15 |
| dmlfrveg..In.a.typical.day..how.many.servings.of. | pulse_vsorres..Heart.Rate..bpm. | -0.15 |
| sd_onset_clock_h | bmi_vsorres..BMI | 0.15 |
| dmlgrain..In.a.typical.day..how.often.do.you.cons | dmlvex..In.a.typical.week..how.often.do.you.engag | 0.15 |
| diet7..How.many.times.a.week.did.you.eat.desserts | mean_pct_steps_afternoon | 0.15 |
| dmlvex..In.a.typical.week..how.often.do.you.engag | pulse_vsorres..Heart.Rate..bpm. | -0.15 |
| dmlvex..In.a.typical.week..how.often.do.you.engag | sualcage | 0.15 |
| diet7..How.many.times.a.week.did.you.eat.desserts | years_of_education | 0.14 |
| dmlsugar..When.reflecting.on.your.typical.eating. | dmlact..What.would.you.consider.your.typical.acti | 0.14 |
| stress_overall_daily_mean_stress | sd_wake_clock_h | 0.14 |
| dmlvex..In.a.typical.week..how.often.do.you.engag | mean_pct_steps_afternoon | -0.14 |
| sd_wake_clock_h | bmi_vsorres..BMI | 0.14 |
| sualcage | whr_vsorres..Waist.to.Hip.Ratio..WHR. | -0.14 |
| hr_from_ecg | years_of_education | -0.14 |
| mean_pct_steps_evening | years_of_education | -0.14 |
| dmlsugar..When.reflecting.on.your.typical.eating. | pulse_vsorres..Heart.Rate..bpm. | -0.14 |
| dmlfrveg..In.a.typical.day..how.many.servings.of. | sri | 0.13 |
| sd_wake_clock_h | diet7..How.many.times.a.week.did.you.eat.desserts | 0.13 |
| diet6..How.many.times.a.week.did.you.eat.regular. | mean_pct_steps_afternoon | 0.13 |
| hr_from_ecg | whr_vsorres..Waist.to.Hip.Ratio..WHR. | 0.13 |
| pulse_vsorres..Heart.Rate..bpm. | years_of_education | -0.13 |
| dmlhex..How.often.would.you.say.that.you.engage.i | years_of_education | 0.13 |
| dmlgrain..In.a.typical.day..how.often.do.you.cons | pulse_vsorres..Heart.Rate..bpm. | -0.13 |
| mean_pct_steps_afternoon | pulse_vsorres..Heart.Rate..bpm. | 0.13 |
| sd_onset_clock_h | dmlhex..How.often.would.you.say.that.you.engage.i | -0.13 |
| dmlvex..In.a.typical.week..how.often.do.you.engag | sualckncf | -0.13 |
| dmlact..What.would.you.consider.your.typical.acti | pulse_vsorres..Heart.Rate..bpm. | -0.13 |
| dmlsugar..When.reflecting.on.your.typical.eating. | dmlfrveg..In.a.typical.day..how.many.servings.of. | 0.13 |
| dmlfrveg..In.a.typical.day..how.many.servings.of. | sualcage | 0.13 |
| sd_wake_clock_h | pulse_vsorres..Heart.Rate..bpm. | 0.13 |
| dmlpor..When.reflecting.on.your.typical.eating.ha | whr_vsorres..Waist.to.Hip.Ratio..WHR. | -0.12 |
| dmlact..What.would.you.consider.your.typical.acti | whr_vsorres..Waist.to.Hip.Ratio..WHR. | -0.12 |
| dmlsugar..When.reflecting.on.your.typical.eating. | years_of_education | 0.12 |
| dmlpor..When.reflecting.on.your.typical.eating.ha | dmlact..What.would.you.consider.your.typical.acti | 0.12 |
| dmlhex..How.often.would.you.say.that.you.engage.i | age | 0.12 |
| dmlact..What.would.you.consider.your.typical.acti | age | 0.12 |
| sd_onset_clock_h | dmlgrain..In.a.typical.day..how.often.do.you.cons | -0.11 |
| dmlfrveg..In.a.typical.day..how.many.servings.of. | diet6..How.many.times.a.week.did.you.eat.regular. | 0.11 |
| dmlsugar..When.reflecting.on.your.typical.eating. | hr_from_ecg | -0.11 |
| stress_overall_daily_mean_stress | sri | -0.11 |
| diet7..How.many.times.a.week.did.you.eat.desserts | dmlhex..How.often.would.you.say.that.you.engage.i | 0.11 |
| stress_overall_daily_mean_stress | dmlsugar..When.reflecting.on.your.typical.eating. | -0.11 |
| sd_wake_clock_h | age | -0.11 |
| mean_pct_steps_morning | years_of_education | 0.11 |
| dmlgrain..In.a.typical.day..how.often.do.you.cons | bmi_vsorres..BMI | -0.11 |
| whr_vsorres..Waist.to.Hip.Ratio..WHR. | age | 0.11 |
| dmlgrain..In.a.typical.day..how.often.do.you.cons | mean_pct_steps_morning | 0.11 |
| hr_from_ecg | mean_pct_steps_morning | -0.10 |
| dmlpor..When.reflecting.on.your.typical.eating.ha | mean_pct_steps_morning | -0.10 |
| sd_onset_clock_h | sualckncf | -0.10 |
| cv_total_steps | dmlsugar..When.reflecting.on.your.typical.eating. | 0.10 |
| diet7..How.many.times.a.week.did.you.eat.desserts | hr_from_ecg | -0.10 |
| dmlfrveg..In.a.typical.day..how.many.servings.of. | mean_pct_steps_afternoon | -0.10 |
| dmlfrveg..In.a.typical.day..how.many.servings.of. | diet7..How.many.times.a.week.did.you.eat.desserts | 0.10 |
| cv_total_steps | dmlact..What.would.you.consider.your.typical.acti | 0.10 |
| cv_total_steps | bmi_vsorres..BMI | -0.10 |
| mean_pct_steps_morning | bmi_vsorres..BMI | 0.10 |
| hr_from_ecg | sri | -0.10 |
| hr_from_ecg | mean_pct_steps_evening | 0.10 |
| dmlpor..When.reflecting.on.your.typical.eating.ha | pulse_vsorres..Heart.Rate..bpm. | -0.10 |
| stress_overall_daily_mean_stress | age | -0.10 |
| hr_from_ecg | mean_pct_steps_afternoon | 0.10 |
| sd_wake_clock_h | dmlsugar..When.reflecting.on.your.typical.eating. | -0.10 |
| mean_pct_steps_afternoon | sri | -0.10 |
| dmlpor..When.reflecting.on.your.typical.eating.ha | dmlvex..In.a.typical.week..how.often.do.you.engag | 0.10 |
| dmlact..What.would.you.consider.your.typical.acti | sri | 0.09 |
| cv_total_steps | whr_vsorres..Waist.to.Hip.Ratio..WHR. | 0.09 |
| cv_total_steps | diet7..How.many.times.a.week.did.you.eat.desserts | -0.09 |
| stress_overall_daily_mean_stress | diet7..How.many.times.a.week.did.you.eat.desserts | 0.09 |
| sri | sualcage | 0.09 |
| sd_wake_clock_h | dmlvex..In.a.typical.week..how.often.do.you.engag | -0.09 |
| sd_wake_clock_h | dmlact..What.would.you.consider.your.typical.acti | -0.09 |
| cv_total_steps | sri | -0.09 |
| sd_wake_clock_h | diet6..How.many.times.a.week.did.you.eat.regular. | 0.09 |
| dmlvex..In.a.typical.week..how.often.do.you.engag | age | 0.09 |
| dmlsugar..When.reflecting.on.your.typical.eating. | mean_pct_steps_evening | -0.09 |
| sualckncf | whr_vsorres..Waist.to.Hip.Ratio..WHR. | 0.09 |
| sualckncf | bmi_vsorres..BMI | 0.09 |
| sd_wake_clock_h | mean_pct_steps_afternoon | 0.09 |
| diet6..How.many.times.a.week.did.you.eat.regular. | years_of_education | 0.08 |
| diet7..How.many.times.a.week.did.you.eat.desserts | pulse_vsorres..Heart.Rate..bpm. | -0.08 |
| mean_pct_steps_evening | pulse_vsorres..Heart.Rate..bpm. | 0.08 |
| diet6..How.many.times.a.week.did.you.eat.regular. | hr_from_ecg | -0.08 |
| dmlpor..When.reflecting.on.your.typical.eating.ha | diet7..How.many.times.a.week.did.you.eat.desserts | 0.08 |
| dmlact..What.would.you.consider.your.typical.acti | mean_pct_steps_afternoon | -0.08 |
| mean_pct_steps_afternoon | years_of_education | -0.08 |
| diet6..How.many.times.a.week.did.you.eat.regular. | sualcage | -0.08 |
| cv_total_steps | dmlhex..How.often.would.you.say.that.you.engage.i | -0.08 |
| whr_vsorres..Waist.to.Hip.Ratio..WHR. | years_of_education | -0.08 |
| mean_pct_steps_evening | sualcage | 0.08 |
| sd_onset_clock_h | dmlfrveg..In.a.typical.day..how.many.servings.of. | 0.08 |
| diet6..How.many.times.a.week.did.you.eat.regular. | mean_pct_steps_evening | 0.08 |
| mean_pct_steps_evening | whr_vsorres..Waist.to.Hip.Ratio..WHR. | -0.07 |
| dmlhex..How.often.would.you.say.that.you.engage.i | sri | 0.07 |
| stress_overall_daily_mean_stress | dmlgrain..In.a.typical.day..how.often.do.you.cons | -0.07 |
| cv_total_steps | pulse_vsorres..Heart.Rate..bpm. | -0.07 |
| dmlhex..How.often.would.you.say.that.you.engage.i | mean_pct_steps_evening | 0.07 |
| sri | whr_vsorres..Waist.to.Hip.Ratio..WHR. | -0.07 |
| dmlpor..When.reflecting.on.your.typical.eating.ha | sualcage | 0.07 |
| diet6..How.many.times.a.week.did.you.eat.regular. | sri | -0.07 |
| dmlsugar..When.reflecting.on.your.typical.eating. | dmlhex..How.often.would.you.say.that.you.engage.i | 0.07 |
| dmlsugar..When.reflecting.on.your.typical.eating. | bmi_vsorres..BMI | -0.07 |
| sualcage | pulse_vsorres..Heart.Rate..bpm. | -0.07 |
| mean_pct_steps_evening | sualckncf | -0.07 |
| cv_total_steps | dmlgrain..In.a.typical.day..how.often.do.you.cons | 0.07 |
| sd_onset_clock_h | mean_pct_steps_morning | -0.07 |
| dmlpor..When.reflecting.on.your.typical.eating.ha | mean_pct_steps_evening | -0.06 |
| dmlsugar..When.reflecting.on.your.typical.eating. | mean_pct_steps_morning | -0.06 |
| dmlpor..When.reflecting.on.your.typical.eating.ha | diet6..How.many.times.a.week.did.you.eat.regular. | 0.06 |
| diet6..How.many.times.a.week.did.you.eat.regular. | mean_pct_steps_morning | 0.06 |
| dmlact..What.would.you.consider.your.typical.acti | sualcage | 0.06 |
| cv_total_steps | dmlpor..When.reflecting.on.your.typical.eating.ha | 0.06 |
| dmlpor..When.reflecting.on.your.typical.eating.ha | age | 0.06 |
| dmlvex..In.a.typical.week..how.often.do.you.engag | whr_vsorres..Waist.to.Hip.Ratio..WHR. | -0.06 |
| dmlfrveg..In.a.typical.day..how.many.servings.of. | mean_pct_steps_morning | -0.06 |
| sualckncf | pulse_vsorres..Heart.Rate..bpm. | 0.06 |
| sri | bmi_vsorres..BMI | -0.06 |
| dmlgrain..In.a.typical.day..how.often.do.you.cons | sualckncf | 0.06 |
| sualcage | bmi_vsorres..BMI | 0.06 |
| dmlgrain..In.a.typical.day..how.often.do.you.cons | age | 0.06 |
| sd_onset_clock_h | dmlpor..When.reflecting.on.your.typical.eating.ha | 0.06 |
| dmlgrain..In.a.typical.day..how.often.do.you.cons | mean_pct_steps_evening | -0.06 |
| sri | pulse_vsorres..Heart.Rate..bpm. | -0.06 |
| pulse_vsorres..Heart.Rate..bpm. | whr_vsorres..Waist.to.Hip.Ratio..WHR. | 0.06 |
| mean_pct_steps_morning | sri | -0.06 |
| sd_wake_clock_h | sualckncf | -0.06 |
| sd_wake_clock_h | sri | -0.06 |
| dmlsugar..When.reflecting.on.your.typical.eating. | mean_pct_steps_afternoon | -0.06 |
| sd_wake_clock_h | dmlgrain..In.a.typical.day..how.often.do.you.cons | -0.05 |
| sd_onset_clock_h | dmlvex..In.a.typical.week..how.often.do.you.engag | -0.05 |
| mean_pct_steps_afternoon | age | -0.05 |
| dmlgrain..In.a.typical.day..how.often.do.you.cons | sualcage | 0.05 |
| sualcage | years_of_education | 0.05 |
| dmlvex..In.a.typical.week..how.often.do.you.engag | mean_pct_steps_evening | 0.05 |
| cv_total_steps | dmlvex..In.a.typical.week..how.often.do.you.engag | 0.05 |
| stress_overall_daily_mean_stress | whr_vsorres..Waist.to.Hip.Ratio..WHR. | 0.05 |
| dmlpor..When.reflecting.on.your.typical.eating.ha | sualckncf | 0.05 |
| dmlgrain..In.a.typical.day..how.often.do.you.cons | sri | -0.04 |
| cv_total_steps | diet6..How.many.times.a.week.did.you.eat.regular. | -0.04 |
| sd_wake_clock_h | sualcage | -0.04 |
| diet7..How.many.times.a.week.did.you.eat.desserts | bmi_vsorres..BMI | 0.04 |
| dmlpor..When.reflecting.on.your.typical.eating.ha | sri | 0.04 |
| diet7..How.many.times.a.week.did.you.eat.desserts | age | 0.04 |
| cv_total_steps | sualckncf | 0.04 |
| diet6..How.many.times.a.week.did.you.eat.regular. | pulse_vsorres..Heart.Rate..bpm. | -0.04 |
| dmlpor..When.reflecting.on.your.typical.eating.ha | bmi_vsorres..BMI | -0.04 |
| dmlfrveg..In.a.typical.day..how.many.servings.of. | mean_pct_steps_evening | 0.04 |
| sd_onset_clock_h | mean_pct_steps_afternoon | 0.04 |
| dmlact..What.would.you.consider.your.typical.acti | mean_pct_steps_evening | -0.04 |
| sualckncf | years_of_education | -0.04 |
| diet7..How.many.times.a.week.did.you.eat.desserts | sualckncf | -0.04 |
| diet6..How.many.times.a.week.did.you.eat.regular. | age | -0.04 |
| mean_pct_steps_morning | pulse_vsorres..Heart.Rate..bpm. | -0.04 |
| sd_onset_clock_h | diet6..How.many.times.a.week.did.you.eat.regular. | -0.04 |
| diet7..How.many.times.a.week.did.you.eat.desserts | sualcage | 0.04 |
| whr_vsorres..Waist.to.Hip.Ratio..WHR. | bmi_vsorres..BMI | 0.03 |
| mean_pct_steps_afternoon | whr_vsorres..Waist.to.Hip.Ratio..WHR. | -0.03 |
| mean_pct_steps_evening | mean_pct_steps_morning | -0.03 |
| stress_overall_daily_mean_stress | diet6..How.many.times.a.week.did.you.eat.regular. | 0.03 |
| sd_wake_clock_h | dmlfrveg..In.a.typical.day..how.many.servings.of. | 0.03 |
| diet6..How.many.times.a.week.did.you.eat.regular. | sualckncf | -0.03 |
| stress_overall_daily_mean_stress | dmlpor..When.reflecting.on.your.typical.eating.ha | -0.03 |
| diet7..How.many.times.a.week.did.you.eat.desserts | mean_pct_steps_morning | 0.03 |
| dmlact..What.would.you.consider.your.typical.acti | mean_pct_steps_morning | 0.03 |
| cv_total_steps | hr_from_ecg | -0.03 |
| dmlgrain..In.a.typical.day..how.often.do.you.cons | mean_pct_steps_afternoon | -0.03 |
| sd_wake_clock_h | dmlhex..How.often.would.you.say.that.you.engage.i | 0.03 |
| dmlfrveg..In.a.typical.day..how.many.servings.of. | age | -0.02 |
| diet7..How.many.times.a.week.did.you.eat.desserts | whr_vsorres..Waist.to.Hip.Ratio..WHR. | 0.02 |
| sd_onset_clock_h | whr_vsorres..Waist.to.Hip.Ratio..WHR. | -0.02 |
| diet6..How.many.times.a.week.did.you.eat.regular. | whr_vsorres..Waist.to.Hip.Ratio..WHR. | -0.02 |
| sd_wake_clock_h | whr_vsorres..Waist.to.Hip.Ratio..WHR. | -0.02 |
| sri | sualckncf | 0.02 |
| diet7..How.many.times.a.week.did.you.eat.desserts | dmlact..What.would.you.consider.your.typical.acti | 0.02 |
| sd_wake_clock_h | dmlpor..When.reflecting.on.your.typical.eating.ha | -0.02 |
| cv_total_steps | sd_wake_clock_h | 0.02 |
| mean_pct_steps_morning | whr_vsorres..Waist.to.Hip.Ratio..WHR. | -0.02 |
| dmlsugar..When.reflecting.on.your.typical.eating. | age | -0.02 |
| dmlpor..When.reflecting.on.your.typical.eating.ha | hr_from_ecg | -0.02 |
| dmlgrain..In.a.typical.day..how.often.do.you.cons | hr_from_ecg | -0.02 |
| dmlvex..In.a.typical.week..how.often.do.you.engag | mean_pct_steps_morning | -0.02 |
| dmlsugar..When.reflecting.on.your.typical.eating. | sualcage | -0.02 |
| diet6..How.many.times.a.week.did.you.eat.regular. | bmi_vsorres..BMI | -0.02 |
| diet7..How.many.times.a.week.did.you.eat.desserts | mean_pct_steps_evening | 0.01 |
| dmlsugar..When.reflecting.on.your.typical.eating. | sualckncf | -0.01 |
| dmlpor..When.reflecting.on.your.typical.eating.ha | mean_pct_steps_afternoon | 0.01 |
| dmlhex..How.often.would.you.say.that.you.engage.i | mean_pct_steps_morning | 0.01 |
| dmlpor..When.reflecting.on.your.typical.eating.ha | years_of_education | -0.01 |
| dmlgrain..In.a.typical.day..how.often.do.you.cons | whr_vsorres..Waist.to.Hip.Ratio..WHR. | 0.01 |
| sd_onset_clock_h | diet7..How.many.times.a.week.did.you.eat.desserts | 0.01 |
| mean_pct_steps_evening | sri | 0.01 |
| cv_total_steps | dmlfrveg..In.a.typical.day..how.many.servings.of. | 0.01 |
| sualcage | age | -0.01 |
| cv_total_steps | years_of_education | 0.01 |
| dmlhex..How.often.would.you.say.that.you.engage.i | mean_pct_steps_afternoon | 0.01 |
| mean_pct_steps_morning | sualckncf | -0.01 |
| sd_onset_clock_h | sri | 0.00 |
| mean_pct_steps_evening | bmi_vsorres..BMI | 0.00 |
| mean_pct_steps_afternoon | sualckncf | 0.00 |
| hr_from_ecg | sualcage | 0.00 |
| dmlhex..How.often.would.you.say.that.you.engage.i | sualcage | 0.00 |
| sualckncf | age | 0.00 |
| mean_pct_steps_afternoon | sualcage | 0.00 |
| dmlsugar..When.reflecting.on.your.typical.eating. | whr_vsorres..Waist.to.Hip.Ratio..WHR. | 0.00 |
| sualckncf | sualcage | 0.00 |
# 7. Summary description
# Example summary_stats dataframe
summary_stats <- cor_table_unique %>%
dplyr::summarize(
Mean_Correlation = mean(Correlation, na.rm = TRUE),
Median_Correlation = median(Correlation, na.rm = TRUE),
Max_Correlation = max(Correlation, na.rm = TRUE),
Min_Correlation = min(Correlation, na.rm = TRUE)
)
# Display as horizontal table
kable(summary_stats, "html", caption = "Summary of Correlations") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = F, font_size = 14) %>%
row_spec(0, bold = TRUE, background = "#A5D8FF")| Mean_Correlation | Median_Correlation | Max_Correlation | Min_Correlation |
|---|---|---|---|
| 0.0260519 | 0.0115898 | 0.7517405 | -0.6285951 |
X <- df_test %>%
dplyr::select(
fh_dm2sb_cat,
years_of_education,
age,
bmi_vsorres..BMI,
whr_vsorres..Waist.to.Hip.Ratio..WHR.,
pulse_vsorres..Heart.Rate..bpm.,
sualcage,
sualckncf,
#hrv_rmssd,
sri,
mean_pct_steps_morning,
mean_pct_steps_evening,
dmlhex..How.often.would.you.say.that.you.engage.i,
dmlvex..In.a.typical.week..how.often.do.you.engag,
hr_from_ecg,
mean_active_minutes,
diet6..How.many.times.a.week.did.you.eat.regular.,
diet1..How.many.times.a.week.did.you.eat.fast.foo,
dmlgrain..In.a.typical.day..how.often.do.you.cons,
dmlsugar..When.reflecting.on.your.typical.eating.,
cv_total_steps,
stress_overall_daily_mean_stress) %>%
mutate(across(where(is.factor), ~ factor(trimws(.))))
# ------------------------------
# 2. Define target variable
# ------------------------------
# 1. Clean target
X <- X %>% mutate(across(where(is.numeric), as.double))
y <- factor(trimws(df_test$IR_label))
levels(y) <- make.names(levels(y)) # ensures valid R names
# 2. Combine with predictors
df_model <- X
df_model$y <- y
#df_model <- na.omit(df_model)
X_clean <- df_model %>% select(-y)
y_clean <- df_model$y
# 3. Train/test split
set.seed(123)
train_index <- createDataPartition(y_clean, p = 0.8, list = FALSE)
X_train <- X_clean[train_index, ]
X_test <- X_clean[-train_index, ]
y_train <- y_clean[train_index]
y_test <- y_clean[-train_index]
# 4. Train RF
set.seed(123)
train_control <- trainControl(
method = "cv",
number = 10,
sampling = "up" ,
savePredictions = "final",
classProbs = TRUE
)
set.seed(123)
model_rf <- train(
x = X_train,
y = y_train,
method = "rf",
trControl = train_control,
preProcess = c("medianImpute"),
tuneLength = 5
)
# 5. Evaluate
pred_class <- predict(model_rf, X_test)
cm <- caret::confusionMatrix(pred_class, y_test)
accuracy <- as.numeric(cm$overall["Accuracy"])
sensitivity <- as.numeric(cm$byClass["Sensitivity"])
specificity <- as.numeric(cm$byClass["Specificity"])
precision <- as.numeric(cm$byClass["Pos Pred Value"])
f1_score <- 2 * (precision * sensitivity) / (precision + sensitivity)
# Compute Balanced Accuracy
balanced_accuracy <- (sensitivity + specificity) / 2
# Create metrics dataframe including Balanced Accuracy
metrics <- data.frame(
Accuracy = accuracy,
Sensitivity = sensitivity,
Specificity = specificity,
Balanced_Accuracy = balanced_accuracy,
Precision = precision,
F1_Score = f1_score
)
# Example metrics dataframe
metrics <- data.frame(
Metric = c("Accuracy", "Sensitivity", "Specificity", "Balanced Accuracy", "Precision", "F1 Score"),
Value = c(accuracy, sensitivity, specificity, balanced_accuracy, precision, f1_score)
)
# Create a nice HTML table
kable(metrics, "html", caption = "Model Performance Metrics") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = T, font_size = 14) %>%
row_spec(0, bold = TRUE, background = "#A5D8FF") %>%
column_spec(2, bold = TRUE)| Metric | Value |
|---|---|
| Accuracy | 0.7407407 |
| Sensitivity | 0.8333333 |
| Specificity | 0.6666667 |
| Balanced Accuracy | 0.7500000 |
| Precision | 0.6666667 |
| F1 Score | 0.7407407 |
# 2. ROC Curve & AUC
# ------------------------------
pred_probs <- predict(model_rf, X_test, type = "prob")[, "IR"]
roc_obj <- roc(y_test, pred_probs, levels = rev(levels(y_test)))
# ROC plot using ggplot2
roc_df <- data.frame(
FPR = 1 - roc_obj$specificities,
TPR = roc_obj$sensitivities
)
roc_plot = ggplot(roc_df, aes(x = FPR, y = TPR)) +
geom_line(color = "#2C7BB6", size = 1.2) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "grey50") +
annotate("text", x = 0.6, y = 0.1, label = paste0("AUC = ", round(auc(roc_obj), 3)), size = 5) +
labs(title = "ROC Curve", x = "False Positive Rate (1 - Specificity)", y = "True Positive Rate (Sensitivity)") +
theme_classic(base_size = 14) +
theme(aspect.ratio=0.9)
# ------------------------------
# 3. Precision-Recall Curve & PRAUC
# ------------------------------
# Convert y_test to binary numeric: 1 = IR, 0 = Non-IR
y_bin <- ifelse(y_test == "IR", 1, 0)
pr_obj <- pr.curve(scores.class0 = pred_probs, weights.class0 = y_bin, curve = TRUE)
pr_df <- data.frame(
Recall = pr_obj$curve[,1],
Precision = pr_obj$curve[,2]
)
pr_plot = ggplot(pr_df, aes(x = Recall, y = Precision)) +
geom_line(color = "#D7191C", size = 1.2) +
annotate("text", x = 0.6, y = 0.1, label = paste0("PRAUC = ", round(pr_obj$auc.integral, 3)), size = 5) +
labs(title = "Precision-Recall Curve", x = "Recall (Sensitivity)", y = "Precision") +
theme_classic(base_size = 14) +
theme(aspect.ratio=0.9)
combined_plot <- roc_plot + pr_plot + plot_layout(ncol = 2)
combined_plot # shap <- Shapley$new(predictor, x.interest = X_train[1, ]) # example: first row
# # If you want all rows, you can loop or sample (expensive for large datasets)
#
# # Extract SHAP values for the first observation
# shap_values <- shap$results
# print(shap_values)
# ------------------------------
# 4. Summary plot (global feature importance via SHAP)
# ------------------------------
# shapley_summary <- FeatureImp$new(predictor, loss = "ce") # cross-entropy loss for classification
# plot(shapley_summary) # shows global feature importancen_runs <- 50
all_var_imp <- matrix(NA, nrow = ncol(X_train), ncol = n_runs)
rownames(all_var_imp) <- colnames(X_train)
set.seed(123)
for (i in 1:n_runs) {
train_idx <- createDataPartition(y_clean, p = 0.7, list = FALSE)
X_tr <- X_clean[train_idx, ]
X_te <- X_clean[-train_idx, ]
y_tr <- y_clean[train_idx]
y_te <- y_clean[-train_idx]
rf <- train(
x = X_tr,
y = y_tr,
method = "rf",
trControl = trainControl(method = "cv", number = 5),
preProcess = c("medianImpute"),
tuneLength = 5
)
imp <- varImp(rf, scale = TRUE)$importance
all_var_imp[, i] <- imp$Overall
}
var_imp_mean <- rowMeans(all_var_imp)
var_imp_sd <- apply(all_var_imp, 1, sd)
var_imp_stable <- data.frame(
Feature = rownames(all_var_imp),
MeanImportance = var_imp_mean,
SDImportance = var_imp_sd
) %>% arrange(desc(MeanImportance))
print(head(var_imp_stable, 20))## Feature
## bmi_vsorres..BMI bmi_vsorres..BMI
## whr_vsorres..Waist.to.Hip.Ratio..WHR. whr_vsorres..Waist.to.Hip.Ratio..WHR.
## stress_overall_daily_mean_stress stress_overall_daily_mean_stress
## hr_from_ecg hr_from_ecg
## sri sri
## mean_active_minutes mean_active_minutes
## pulse_vsorres..Heart.Rate..bpm. pulse_vsorres..Heart.Rate..bpm.
## mean_pct_steps_morning mean_pct_steps_morning
## mean_pct_steps_evening mean_pct_steps_evening
## age age
## cv_total_steps cv_total_steps
## sualcage sualcage
## years_of_education years_of_education
## dmlhex..How.often.would.you.say.that.you.engage.i dmlhex..How.often.would.you.say.that.you.engage.i
## diet6..How.many.times.a.week.did.you.eat.regular. diet6..How.many.times.a.week.did.you.eat.regular.
## dmlvex..In.a.typical.week..how.often.do.you.engag dmlvex..In.a.typical.week..how.often.do.you.engag
## fh_dm2sb_cat fh_dm2sb_cat
## dmlsugar..When.reflecting.on.your.typical.eating. dmlsugar..When.reflecting.on.your.typical.eating.
## dmlgrain..In.a.typical.day..how.often.do.you.cons dmlgrain..In.a.typical.day..how.often.do.you.cons
## diet1..How.many.times.a.week.did.you.eat.fast.foo diet1..How.many.times.a.week.did.you.eat.fast.foo
## MeanImportance SDImportance
## bmi_vsorres..BMI 97.794220 7.545864
## whr_vsorres..Waist.to.Hip.Ratio..WHR. 72.945954 17.990147
## stress_overall_daily_mean_stress 41.423222 19.519506
## hr_from_ecg 28.679576 15.919824
## sri 26.796903 16.669371
## mean_active_minutes 26.728777 13.692341
## pulse_vsorres..Heart.Rate..bpm. 26.363365 14.957959
## mean_pct_steps_morning 22.181617 11.449555
## mean_pct_steps_evening 21.277253 11.878361
## age 20.500170 11.427062
## cv_total_steps 20.180248 11.926483
## sualcage 18.225723 9.409087
## years_of_education 17.030107 12.084327
## dmlhex..How.often.would.you.say.that.you.engage.i 15.056297 13.616977
## diet6..How.many.times.a.week.did.you.eat.regular. 14.953623 8.906635
## dmlvex..In.a.typical.week..how.often.do.you.engag 8.522026 7.229816
## fh_dm2sb_cat 6.963167 7.054268
## dmlsugar..When.reflecting.on.your.typical.eating. 4.843838 3.347394
## dmlgrain..In.a.typical.day..how.often.do.you.cons 4.475492 4.660171
## diet1..How.many.times.a.week.did.you.eat.fast.foo 3.068553 3.475977
# ------------------------------
# 9. SHAP values (conditional contribution)
library(randomForest)
library(iml)
# ------------------------------
predictor <- Predictor$new(
model = model_rf,
data = X_train,
y = y_train,
type = "prob"
)
# Example: compute SHAP for first 100 rows
sample_idx <- sample(1:nrow(X_train), min(100, nrow(X_train)))
shap_values_list <- lapply(sample_idx, function(i) {
shap <- Shapley$new(predictor, x.interest = X_train[i, ])
shap$results
})
shap_all <- do.call(rbind, shap_values_list)
shap_global <- shap_all %>%
group_by(feature) %>%
summarise(mean_abs_shap = mean(abs(phi))) %>%
arrange(desc(mean_abs_shap))
# Combine SHAP results
shap_all <- do.call(rbind, shap_values_list)
# Summarize global SHAP values
shap_global <- shap_all %>%
group_by(feature) %>%
summarise(mean_abs_shap = mean(abs(phi)), .groups = "drop") %>%
arrange(desc(mean_abs_shap))
# Create a nice table with kable
shap_global %>%
mutate(rank = row_number()) %>% # optional: add a rank column
select(rank, everything()) %>%
kable(
caption = "Global SHAP Feature Importance",
col.names = c("Rank", "Feature", "Mean |SHAP|"),
digits = 3,
align = c("c", "l", "c")
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE,
position = "center"
)| Rank | Feature | Mean |SHAP| |
|---|---|---|
| 1 | bmi_vsorres..BMI | 0.128 |
| 2 | whr_vsorres..Waist.to.Hip.Ratio..WHR. | 0.077 |
| 3 | stress_overall_daily_mean_stress | 0.028 |
| 4 | diet6..How.many.times.a.week.did.you.eat.regular. | 0.026 |
| 5 | hr_from_ecg | 0.023 |
| 6 | dmlhex..How.often.would.you.say.that.you.engage.i | 0.023 |
| 7 | pulse_vsorres..Heart.Rate..bpm. | 0.018 |
| 8 | sri | 0.015 |
| 9 | mean_pct_steps_evening | 0.014 |
| 10 | age | 0.014 |
| 11 | years_of_education | 0.014 |
| 12 | cv_total_steps | 0.014 |
| 13 | mean_active_minutes | 0.012 |
| 14 | sualcage | 0.011 |
| 15 | mean_pct_steps_morning | 0.011 |
| 16 | fh_dm2sb_cat | 0.010 |
| 17 | dmlsugar..When.reflecting.on.your.typical.eating. | 0.010 |
| 18 | dmlvex..In.a.typical.week..how.often.do.you.engag | 0.008 |
| 19 | dmlgrain..In.a.typical.day..how.often.do.you.cons | 0.005 |
| 20 | sualckncf | 0.005 |
| 21 | diet1..How.many.times.a.week.did.you.eat.fast.foo | 0.004 |
library(dplyr)
library(ggplot2)
library(tidyr)
# ------------------------------
# 1. Combine SHAP with categories
# ------------------------------
shap_categorized <- shap_global %>%
left_join(feature_categories, by = c("feature" = "Feature"))
# ------------------------------
# 2. Compute total and percent contribution by category
# ------------------------------
category_shap <- shap_categorized %>%
group_by(Category) %>%
summarise(
total_shap = sum(mean_abs_shap),
.groups = "drop"
) %>%
mutate(percent_contribution = total_shap / sum(total_shap) * 100)
# ------------------------------
# 3. Plot
# ------------------------------
ggplot(category_shap, aes(x = reorder(Category, -percent_contribution), y = percent_contribution, fill = Category)) +
geom_bar(stat = "identity", color = "black") +
geom_text(aes(label = paste0(round(percent_contribution, 1), "%")), vjust = -0.5, size = 4) +
labs(
title = "SHAP Feature Contributions by Category",
x = "Feature Category",
y = "Percent of Total Mean |SHAP|"
) +
theme_minimal(base_size = 14) +
theme(legend.position = "none") +
coord_cartesian(ylim = c(0, max(category_shap$percent_contribution) * 1.1))df_external <- read.csv("~/external_test.csv", header = TRUE, stringsAsFactors = FALSE)
unhealthy_vars <- c(
"diet1..How.many.times.a.week.did.you.eat.fast.foo",
"diet4..How.many.regular.sodas.or.glasses.of.sweet",
"diet6..How.many.times.a.week.did.you.eat.regular.",
"diet7..How.many.times.a.week.did.you.eat.desserts",
"diet9..How.many.servings.of.fruit.juice.did.you.d"
)
df_external[unhealthy_vars] <- 2 - df_external[unhealthy_vars]
df_external$diet_unhealthy_score <- rowSums(df_external[unhealthy_vars], na.rm = TRUE)
predictor_names <- c(
"fh_dm2sb_cat",
"years_of_education",
"age",
"bmi_vsorres..BMI",
"whr_vsorres..Waist.to.Hip.Ratio..WHR.",
"pulse_vsorres..Heart.Rate..bpm.",
"sualcage",
"sualckncf",
"diet_unhealthy_score",
"sri",
"mean_pct_steps_morning",
"mean_pct_steps_afternoon",
"mean_pct_steps_evening",
"diet4..How.many.regular.sodas.or.glasses.of.sweet",
"dmlhex..How.often.would.you.say.that.you.engage.i",
"dmlact..What.would.you.consider.your.typical.acti",
"dmlvex..In.a.typical.week..how.often.do.you.engag",
"hr_from_ecg",
"mean_active_minutes",
"diet7..How.many.times.a.week.did.you.eat.desserts",
"diet6..How.many.times.a.week.did.you.eat.regular.",
"diet1..How.many.times.a.week.did.you.eat.fast.foo",
"diet9..How.many.servings.of.fruit.juice.did.you.d",
"dmlpor..When.reflecting.on.your.typical.eating.ha",
"dmlfrveg..In.a.typical.day..how.many.servings.of.",
"dmlgrain..In.a.typical.day..how.often.do.you.cons",
"dmlsugar..When.reflecting.on.your.typical.eating.",
"sd_wake_clock_h",
"sd_onset_clock_h",
"cv_total_steps",
"stress_overall_daily_mean_stress")
df_external$fh_dm2sb_cat <- trimws(tolower(df_external$fh_dm2sb_cat))
df_external$fh_dm2sb_cat <- factor(df_external$fh_dm2sb_cat, levels=c("no","yes"))
mode_val <- names(sort(table(X_train$fh_dm2sb_cat), decreasing = TRUE))[1]
# Replace NA in external dataset
df_external$fh_dm2sb_cat[is.na(df_external$fh_dm2sb_cat)] <- mode_val
# Ensure factor levels match training set
df_external$fh_dm2sb_cat <- factor(df_external$fh_dm2sb_cat, levels = levels(X_train$fh_dm2sb_cat))
X_ext <- df_external %>% dplyr::select(all_of(predictor_names))
pre_proc <- preProcess(X_train, method = "medianImpute")
X_ext_clean <- predict(pre_proc, X_ext)
y_ext <- df_external$IR_label
# Replace "Non-IR" → "Non.IR"
df_external$IR_label <- gsub("^Non-IR$", "Non.IR", df_external$IR_label)
# Fix any spacing issues
df_external$IR_label <- trimws(df_external$IR_label)
# Convert to factor with correct levels
y_ext <- factor(df_external$IR_label)
pred_class_ext <- predict(model_rf, X_ext_clean)
cm_ext <- caret::confusionMatrix(pred_class_ext, y_ext)
library(ggplot2)
library(dplyr)
library(scales)
# ------------------------------
# 1. Convert confusionMatrix to tidy dataframe
total_n <- length(y_train) + length(y_test)
split_counts <- data.frame(
Set = factor(c("Train", "Test"), levels = c("Train", "Test")), # force order
N = c(length(y_train), length(y_test))
) %>%
mutate(Percent = N / total_n * 100) # percentage
# ------------------------------
# 2. Plot
# ------------------------------
split_plot <- ggplot(split_counts, aes(x = Set, y = N, fill = Set)) +
geom_bar(stat = "identity", width = 0.5) +
geom_text(aes(label = paste0(N, "\n(", round(Percent,1), "%)")),
vjust = -0.5, size = 5) +
scale_fill_brewer(palette = "Set3") +
labs(title = "Train vs Test Sample Sizes", y = "Number of Samples", x = "", face = "bold") +
theme_minimal(base_size = 8) +
theme(legend.position = "none") +
theme(aspect.ratio=0.9)
# ------------------------------
cm_ext_df <- as.data.frame(cm_ext$table) %>%
dplyr::rename(
True = Reference,
Predicted = Prediction,
Count = Freq
) %>%
group_by(True) %>%
mutate(
Percent = Count / sum(Count) # proportion per true class
) %>%
ungroup()
# ------------------------------
# 2. Plot confusion matrix heatmap
# ------------------------------
cm_ext_plot <- ggplot(cm_ext_df, aes(x = Predicted, y = True, fill = Percent)) +
geom_tile(color = "white", size = 0.5) +
geom_text(aes(label = paste0(Count, "\n(", round(Percent*100,1), "%)")), size = 6) +
scale_fill_gradient(low = "#DCE6F1", high = "#2C7BB6", labels = percent_format()) +
labs(
title = "External Test Set Confusion Matrix",
x = "Predicted Label",
y = "True Label",
fill = "Proportion"
) +
theme_minimal(base_size = 8) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, face = "bold"),
axis.text.y = element_text(face = "bold"),
plot.title = element_text(hjust = 0.5, face = "bold"),
legend.title = element_text(face = "bold"))
# ------------------------------
# 3. Display the plot
# ------------------------------
combined_plot <- split_plot + cm_ext_plot + plot_layout(ncol = 2, widths = c(2, 4))
combined_plot metrics_ext <- data.frame(
Accuracy = cm_ext$overall["Accuracy"],
Sensitivity = cm_ext$byClass["Sensitivity"],
Specificity = cm_ext$byClass["Specificity"],
Balanced_Accuracy = (cm_ext$byClass["Sensitivity"] + cm_ext$byClass["Specificity"]) / 2, # <-- new
Precision = cm_ext$byClass["Pos Pred Value"],
F1_Score = 2 * (cm_ext$byClass["Pos Pred Value"] * cm_ext$byClass["Sensitivity"]) /
(cm_ext$byClass["Pos Pred Value"] + cm_ext$byClass["Sensitivity"])
)
library(knitr)
library(kableExtra)
library(dplyr)
# ------------------------------
# 1. Clean numeric metrics for display
# ------------------------------
metrics_ext_clean <- metrics_ext %>%
# Convert all columns to numeric
mutate(across(everything(), ~ as.numeric(.))) %>%
# Round to 3 decimal places for readability
mutate(across(everything(), ~ round(., 3))) %>%
# Convert to long format for nice table
tidyr::pivot_longer(
cols = everything(),
names_to = "Metric",
values_to = "Value"
)
# ------------------------------
# 2. Create HTML kable
# ------------------------------
kable(metrics_ext_clean, "html", caption = "Extended Model Performance Metrics") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = T,
font_size = 14
) %>%
row_spec(0, bold = TRUE, background = "#A5D8FF") %>%
column_spec(2, bold = TRUE)| Metric | Value |
|---|---|
| Accuracy | 0.696 |
| Sensitivity | 0.577 |
| Specificity | 0.800 |
| Balanced_Accuracy | 0.688 |
| Precision | 0.714 |
| F1_Score | 0.638 |