IR-Prediction

1 Background

Insulin resistance (IR)—a state in which the body requires more insulin to achieve normal glucose regulation is a major predictor of type 2 diabetes. Early detection is essential, yet standard blood test measures such as the oral glucose tolerance test, euglycemic–hyperinsulinemic clamp, or the HOMA-IR require are invasive, costly, and cannot be used for frequent or continuous assessment.

Digital biomarkers—physiological, lifestyle and behavioral data collected via wearables and mobile devices—may potentially offer a scalable, non-invasive alternative for early IR screening. We evaluated whether non-invasive digital biomarkers combining sleep, physical activity, nutrfactors, and anthopopometrics, and nutrition can predict HOMA-IR–based IR in adults with normoglycemia, prediabetes and type-2 diabetes.

Demonstrating intitial feasibility, random forest models, built on lifestyle data and anthropometrics, predicted IR out-of-sample (Study 1: balanced accuracy = 0.71, 95% CI: 0.69–0.73, p = .020; AUC = 0.87, 95% CI: 0.85–0.88, p = .020) and replicated in an independent test cohort (Study 2: N=56; balanced accuracy = 0.71, 95% CI: 0.69–0.73, p = .020; AUC = 0.87, 95% CI: 0.85–0.88, p = .020); with top predictors including [insert features]. Results remained robust across modeling specifications.

Together, findings highlight the potential of digital biomarkers for early, non-invasive detection of insulin resistance,futher triggering reventive lifestyle interventions delivered in daily life.

2 Data Preparation

We use the Flagship Dataset of Type 2 Diabetes from open source AI-READI Project.

Background: (https://aireadi.org)
Documentation: https://docs.aireadi.org
Version 2 of the AI‑READI “Flagship Dataset of Type-2 Diabetes” was made publicly available on November 8,2024.
AI-READI, funded by the U.S. National Institutes of Health through the Bridge2AI Program, aims to collect and share a multimodal, AI-ready dataset for studying salutogenesis in Type 2 Diabetes Mellitus.

2.1 PRISMA inclusion

We include adult participants across three clinical sites (UAB, UW, UCSD) ) who are normoglycemic, have prediabetes, or have type 2 diabetes, and who meet the following criteria:
- Reported fasting (8hours or more) prior to glucose and insulin blood tests
- Are not on insulin medication
- Have available smartwatch data (Garmin venu)
- Have available blood test results (fasting glucose & insulin) to be able to calculate HOMA-IR (Homeostatic Model Assessment of Insulin Resistance)
We further split participants by site:
- UAB and UW for model development and testing (Study 1),
- UCSD is held out/ serves as the external model validation cohort (Study 2).

Both cohorts follow the same inclusion criteria described above.

# ========================================
# STEP 1: Load datasets
# ========================================
hba1c <- read.delim("~/Downloads/participants.tsv", header = TRUE, stringsAsFactors = FALSE)
biom  <- read.csv("~/Downloads/biomarker_data.csv", header = TRUE, stringsAsFactors = FALSE)

df <- merge(hba1c, biom, by.x = "participant_id", by.y = "person_id")
n_start <- length(unique(df$participant_id))

# ========================================
# STEP 2: Process observation file for fasting
# ========================================
ob <- read.delim("~/Downloads/observation(in).csv", header = TRUE, stringsAsFactors = FALSE)

ob_split <- ob %>%
  separate(
    col = 1,
    into = c("observation_id","person_id","observation_concept_id",
             "observation_date","observation_datetime","observation_type_concept_id",
             "value_as_number","value_as_string","value_as_concept_id",
             "qualifier_concept_id","unit_concept_id","provider_id",
             "visit_occurrence_id","visit_detail_id","observation_source_value",
             "observation_source_concept_id","unit_source_value","value_source_value",
             "value_source_concept_id","some_extra1","some_extra2"),
    sep = ",", fill = "right"
  )

paate_rows <- ob_split %>%
  filter(grepl("paate", observation_source_value, ignore.case = TRUE)) %>%
  select(value_as_string, person_id)

paate_rows$value_as_string <- as.numeric(paate_rows$value_as_string)
paate_rows$person_id <- as.integer(paate_rows$person_id)

fasting_rows <- paate_rows %>% filter(value_as_string >= 8)

# ========================================
# STEP 3: Sequential filtering
# ========================================
# Step 1: Non-insulin-dependent
df_step1 <- df %>% filter(!study_group %in% "insulin_dependent")
n_step1 <- length(unique(df_step1$participant_id))
removed_step1 <- n_start - n_step1

# Step 2: Fasting 8+ hours
df_step2 <- df_step1 %>%
  left_join(fasting_rows, by = c("participant_id" = "person_id")) %>%
  filter(!is.na(value_as_string))
n_step2 <- length(unique(df_step2$participant_id))
removed_step2 <- n_step1 - n_step2

# Step 3: Exclude UCSD site
df_step3 <- df_step2 %>% filter(clinical_site != "UCSD")
n_step3 <- length(unique(df_step3$participant_id))
removed_step3 <- n_step2 - n_step3

# Step 4: Wearable data available
df_step4 <- df_step3 %>% filter(wearable_activity_monitor == TRUE)
n_step4 <- length(unique(df_step4$participant_id))
removed_step4 <- n_step3 - n_step4

# Step 5: Remove participants with missing Glucose OR INSULIN
df_step5 <- df_step4 %>% filter(!is.na(Glucose..mg.dL.) & !is.na(INSULIN..ng.mL.))
n_step5 <- length(unique(df_step5$participant_id))
removed_step5 <- n_step4 - n_step5

# Step 6: Compute HOMA_insulin
df_step5$HOMA_insulin <- ((df_step5$INSULIN..ng.mL. * 24) * df_step5$Glucose..mg.dL.) / 405

# Step 7: Remove HOMA outliers (>3 SD above median)
median_homa <- median(df_step5$HOMA_insulin, na.rm = TRUE)
sd_homa <- sd(df_step5$HOMA_insulin, na.rm = TRUE)

df_final <- df_step5 %>%
  filter(HOMA_insulin <= (median_homa + 3 * sd_homa))
n_final <- length(unique(df_final$participant_id))
removed_step6 <- n_step5 - n_final

# ========================================
# UCSD-only filtering (Stepwise, remove other sites explicitly)
# ========================================
df_ucsd <- df  # start with full dataset

# Step 1: Non-insulin-dependent
df_ucsd_step1 <- df_ucsd %>% filter(!study_group %in% "insulin_dependent")
n_step1_ucsd <- length(unique(df_ucsd_step1$participant_id))
removed_step1_ucsd <- n_start - n_step1_ucsd  # same starting n_start

# Step 2: Fasting 8+ hours
df_ucsd_step2 <- df_ucsd_step1 %>%
  left_join(fasting_rows, by = c("participant_id" = "person_id")) %>%
  filter(!is.na(value_as_string))
n_step2_ucsd <- length(unique(df_ucsd_step2$participant_id))
removed_step2_ucsd <- n_step1_ucsd - n_step2_ucsd

# Step 3: Keep only UCSD, remove participants from other two sites
df_ucsd_step3 <- df_ucsd_step2 %>% filter(clinical_site == "UCSD")
n_step3_ucsd <- length(unique(df_ucsd_step3$participant_id))
removed_step3_ucsd <- n_step2_ucsd - n_step3_ucsd

# Step 4: Wearable data available
df_ucsd_step4 <- df_ucsd_step3 %>% filter(wearable_activity_monitor == TRUE)
n_step4_ucsd <- length(unique(df_ucsd_step4$participant_id))
removed_step4_ucsd <- n_step3_ucsd - n_step4_ucsd

# Step 5: Remove participants with missing Glucose OR INSULIN
df_ucsd_step5 <- df_ucsd_step4 %>% filter(!is.na(Glucose..mg.dL.) & !is.na(INSULIN..ng.mL.))
n_step5_ucsd <- length(unique(df_ucsd_step5$participant_id))
removed_step5_ucsd <- n_step4_ucsd - n_step5_ucsd

# Step 6: Compute HOMA_insulin
df_ucsd_step5$HOMA_insulin <- ((df_ucsd_step5$INSULIN..ng.mL. * 24) * df_ucsd_step5$Glucose..mg.dL.) / 405

# Step 7: Remove HOMA outliers (>3 SD above median)
median_homa_ucsd <- median(df_ucsd_step5$HOMA_insulin, na.rm = TRUE)
sd_homa_ucsd <- sd(df_ucsd_step5$HOMA_insulin, na.rm = TRUE)

df_ucsd_final <- df_ucsd_step5 %>%
  filter(HOMA_insulin <= (median_homa_ucsd + 3 * sd_homa_ucsd))
n_final_ucsd <- length(unique(df_ucsd_final$participant_id))
removed_step6_ucsd <- n_step5_ucsd - n_final_ucsd

# ========================================
# Side-by-side PRISMA (original visuals)
# ========================================
grViz(glue('
digraph prisma_sidebyside {{
  rankdir = TB;
  graph [layout = dot, nodesep = 0.5, ranksep = 0.75]

  node [
    shape = box,
    style = "rounded,filled",
    color = "#4A4A4A",
    fontcolor = "black",
    fillcolor = "#E8F0FE",
    fontsize = 12,
    penwidth = 1.2,
    width = 3
  ]

  # Study 1: Model Development & Testing
  subgraph cluster_a {{
    label = "Study 1: Model Development & Testing"
    a1 [label="Starting dataset\\n(n = {n_start})"]
    b1 [label="Non–insulin-dependent\\n(n = {n_step1})\\n(removed {removed_step1})"]
    c1 [label="Fasting 8+ hours\\n(n = {n_step2})\\n(removed {removed_step2})"]
    d1 [label="Clinical site != UCSD\\n(n = {n_step3})\\n(removed {removed_step3})"]
    e1 [label="Have wearable data\\n(n = {n_step4})\\n(removed {removed_step4})"]
    f1 [label="Glucose OR INSULIN not missing\\n(n = {n_step5})\\n(removed {removed_step5})"]
    g1 [label="HOMA outliers removed\\n(n = {n_final})\\n(removed {removed_step6})"]
    a1 -> b1 -> c1 -> d1 -> e1 -> f1 -> g1
  }}

  # Study 2: External Validation
  subgraph cluster_b {{
    label = "Study 2: UCSD-only (External Validation)"
    a2 [label="Starting dataset\\n(n = {n_start})"]
    b2 [label="Non–insulin-dependent\\n(n = {n_step1_ucsd})\\n(removed {removed_step1_ucsd})"]
    c2 [label="Fasting 8+ hours\\n(n = {n_step2_ucsd})\\n(removed {removed_step2_ucsd})"]
    d2 [label="Only UCSD site\\n(n = {n_step3_ucsd})\\n(removed {removed_step3_ucsd})"]
    e2 [label="Have wearable data\\n(n = {n_step4_ucsd})\\n(removed {removed_step4_ucsd})"]
    f2 [label="Glucose OR INSULIN not missing\\n(n = {n_step5_ucsd})\\n(removed {removed_step5_ucsd})"]
    g2 [label="HOMA outliers removed\\n(n = {n_final_ucsd})\\n(removed {removed_step6_ucsd})"]
    a2 -> b2 -> c2 -> d2 -> e2 -> f2 -> g2
  }}
}}
'))

2.2 HOMA-IR Calculation

In the next data preparation section, we take four steps to construct and refine our insulin-resistance measures. First, we calculate a person-level continuous insulin resistance score using the HOMA-IR (Homeostatic Model Assessment for Insulin Resistance) equation, which provides a widely used physiological estimate of insulin resistance derived from fasting glucose and insulin. Second, we inspect and treat outliers to reduce the influence of implausible or extreme values (e.g., non fasting values). Third, we binarize the continuous HOMA-IR score into “insulin resistant” versus “non-resistant” categories, allowing for categorical comparisons and easier interpretability in later modeling. Finally, we descriptively compare our binary insulin-resistance classification to the distribution of HbA1c, which helps illustrate how the HOMA-IR–based categorization aligns with established clinically relevant glycemic marker.

2.2.1 1. HOMA-IR Equation

HOMA-IS calculation

https://www.mdcalc.com/calc/3120/homa-ir-homeostatic-model-assessment-insulin-resistance

2.2.2 2. Outlier Inspection

We removed values that fell +3 SD deviations from the median to account for improbable values (e.g. individuals who likely did not fast; HOMA-IS >35).
Aplied to 3 individuals in Study 1 and to 1 individual in Study 2.

# -----------------------------
before <- data.frame(HOMA = c(df_step5$HOMA_insulin)) # replace with real data
before_no_outliers <- before %>%
  filter(HOMA >= mean(HOMA, na.rm = TRUE) - 3*sd(HOMA, na.rm = TRUE),
         HOMA <= mean(HOMA, na.rm = TRUE) + 3*sd(HOMA, na.rm = TRUE))

# -----------------------------
# PANEL A: Density Before
# -----------------------------
mean_val_A   <- mean(before$HOMA, na.rm = TRUE)
median_val_A <- median(before$HOMA, na.rm = TRUE)
sd_val_A     <- sd(before$HOMA, na.rm = TRUE)
min_val_A    <- min(before$HOMA, na.rm = TRUE)
max_val_A    <- max(before$HOMA, na.rm = TRUE)

dens_A <- density(before$HOMA, na.rm = TRUE)
ymax_plot_A <- max(dens_A$y)*1.05
legend_x_A <- min_val_A + 0.3*(max_val_A - min_val_A)
symbol_y_A <- seq(ymax_plot_A - 0.08*ymax_plot_A, ymax_plot_A - 0.22*ymax_plot_A, length.out = 3)
legend_y_top_A <- ymax_plot_A - 0.02*ymax_plot_A
legend_df_A <- data.frame(
  x = rep(legend_x_A, 3),
  y = symbol_y_A,
  label = c("Mean","Median","SD"),
  text = c(paste0("Mean: ", round(mean_val_A,2)),
           paste0("Median: ", round(median_val_A,2)),
           paste0("SD: ", round(sd_val_A,2)))
)
min_max_text_A <- paste0("Min: ", round(min_val_A,2), " | Max: ", round(max_val_A,2))

p_density_A <- ggplot(before, aes(HOMA)) +
  geom_density(fill="#A5D8FF", alpha=0.5, color="#1B6CA8", linewidth=1.1) +
  geom_vline(aes(xintercept=mean_val_A, color="Mean"), linewidth=1) +
  geom_vline(aes(xintercept=median_val_A, color="Median"), linewidth=1) +
  geom_vline(aes(xintercept=mean_val_A + sd_val_A, color="SD"), linetype="dotted") +
  geom_vline(aes(xintercept=mean_val_A - sd_val_A, color="SD"), linetype="dotted") +
  geom_point(data=legend_df_A, aes(x=x, y=y, color=label), size=1.5) +
  geom_text(data=legend_df_A, aes(x=x + 0.02*(max(before$HOMA)-min(before$HOMA)),
                                  y=y, label=text),
            hjust=0, color="black", size=3) +  # slightly smaller text
  annotate("text", x=legend_x_A, y=legend_y_top_A, label=min_max_text_A,
           hjust=0, color="black", size=3) +
  scale_color_manual(values=c("Mean"="#00468B","Median"="darkred","SD"="gray")) +
  labs(title="A: Including Outliers", x="HOMA-Insulin", y="Density") +
  theme_classic(base_size=10) +
  theme(legend.position="none",
        plot.title = element_text(face="bold", hjust=0, size=12)) +
  ylim(0, ymax_plot_A) +
  theme(aspect.ratio=0.9)

# -----------------------------
# PANEL B: Density After
# -----------------------------
mean_val_B   <- mean(before_no_outliers$HOMA, na.rm = TRUE)
median_val_B <- median(before_no_outliers$HOMA, na.rm = TRUE)
sd_val_B     <- sd(before_no_outliers$HOMA, na.rm = TRUE)
min_val_B    <- min(before_no_outliers$HOMA, na.rm = TRUE)
max_val_B    <- max(before_no_outliers$HOMA, na.rm = TRUE)

dens_B <- density(before_no_outliers$HOMA, na.rm = TRUE)
ymax_plot_B <- max(dens_B$y)*1.05
legend_x_B <- min_val_B + 0.3*(max_val_B - min_val_B)
symbol_y_B <- seq(ymax_plot_B - 0.08*ymax_plot_B, ymax_plot_B - 0.22*ymax_plot_B, length.out = 3)
legend_y_top_B <- ymax_plot_B - 0.02*ymax_plot_B
legend_df_B <- data.frame(
  x=rep(legend_x_B,3),
  y=symbol_y_B,
  label=c("Mean","Median","SD"),
  text=c(paste0("Mean: ", round(mean_val_B,2)),
         paste0("Median: ", round(median_val_B,2)),
         paste0("SD: ", round(sd_val_B,2)))
)
min_max_text_B <- paste0("Min: ", round(min_val_B,2), " | Max: ", round(max_val_B,2))

p_density_B <- ggplot(before_no_outliers, aes(HOMA)) +
  geom_density(fill="#A5D8FF", alpha=0.5, color="#1B6CA8", linewidth=1.1) +
  geom_vline(aes(xintercept=mean_val_B, color="Mean"), linewidth=1) +
  geom_vline(aes(xintercept=median_val_B, color="Median"), linewidth=1) +
  geom_vline(aes(xintercept=mean_val_B + sd_val_B, color="SD"), linetype="dotted") +
  geom_vline(aes(xintercept=mean_val_B - sd_val_B, color="SD"), linetype="dotted") +
  geom_point(data=legend_df_B, aes(x=x, y=y, color=label), size=1.5) +
  geom_text(data=legend_df_B, aes(x=x + 0.02*(max(before_no_outliers$HOMA)-min(before_no_outliers$HOMA)),
                                  y=y, label=text),
            hjust=0, color="black", size=3) +  # slightly smaller text
  annotate("text", x=legend_x_B, y=legend_y_top_B, label=min_max_text_B,
           hjust=0, color="black", size=3) +
    scale_color_manual(values=c("Mean"="#00468B","Median"="darkred","SD"="gray")) +
  labs(title="B: Post Outlier Removal", x="HOMA-Insulin", y="Density") +
  theme_classic(base_size=10) +
  theme(legend.position="none",
        plot.title = element_text(face="bold", hjust=0, size=12)) +
  ylim(0, ymax_plot_B) +
  theme(aspect.ratio=0.9)


# -----------------------------
# PANEL C: Boxplot Before
# -----------------------------
line_3sd <- median(before$HOMA, na.rm = TRUE) + 3*sd(before$HOMA, na.rm = TRUE)

# Add a color column: dark red if > median + 3 SD, else default
before <- before %>%
  mutate(color_flag = ifelse(HOMA > line_3sd, "above_3SD", "normal"))

# Plot
p_A <- ggplot(before, aes(x = "", y = HOMA)) +
  geom_boxplot(width = 0.4, fill = "#A5D8FF", alpha = 0.7, outlier.shape = NA) +
  geom_jitter(aes(color = color_flag), width = 0.1, size = 2, alpha = 0.6) +
  scale_color_manual(values = c("normal" = "#1B6CA8", "above_3SD" = "darkred")) +
  geom_hline(yintercept = line_3sd, linetype = "dashed", color = "gray", size = 0.7) +
  annotate("text", x = 1.2, y = line_3sd + 2, 
           label = paste0("Med. + 3SD = ", round(line_3sd,2)),
           hjust = 0, color = "gray20", size = 3) +
  labs(title = "C: Including Outliers", x = "", y = "HOMA-Insulin") +
  theme_classic(base_size = 10) +
  theme(plot.title = element_text(face = "bold", hjust = 0, size = 12),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "none") +
  theme(aspect.ratio = 0.9)

# -----------------------------
# PANEL D: Boxplot After
# -----------------------------
p_B <- ggplot(before_no_outliers, aes(x="", y=HOMA)) +
  geom_boxplot(width=0.4, fill="#A5D8FF", alpha=0.7, outlier.shape=NA) +
  geom_jitter(width=0.1, size=2, color="#1B6CA8", alpha=0.4) +
  labs(title="D: Post Outlier Removal", x="", y="HOMA-Insulin") +
  theme_classic(base_size=10) +
  theme(plot.title=element_text(face="bold", hjust=0, size=12),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) +
  theme(aspect.ratio=0.9)

# -----------------------------
# Combine 4 plots in 2x2 grid
# -----------------------------
plot_grid(p_density_A, p_density_B,
          p_A, p_B,
          ncol=2, align="hv")

2.2.3 3. Categorical HOMA-IR (DV)

To binarize HOMA-IR, we used > 2.5 as a cut-off (based on NHANES and U.S. Adult Studies).

# -----------------------------
# Prepare data
# -----------------------------
before <- data.frame(HOMA = df_ucsd_step5$HOMA_insulin)
before_no_outliers <- before %>%
  filter(HOMA >= mean(HOMA, na.rm = TRUE) - 3*sd(HOMA, na.rm = TRUE),
         HOMA <= mean(HOMA, na.rm = TRUE) + 3*sd(HOMA, na.rm = TRUE))

# -----------------------------
# PANEL A: Density Before
# -----------------------------
mean_val_A   <- mean(before$HOMA, na.rm = TRUE)
median_val_A <- median(before$HOMA, na.rm = TRUE)
sd_val_A     <- sd(before$HOMA, na.rm = TRUE)
min_val_A    <- min(before$HOMA, na.rm = TRUE)
max_val_A    <- max(before$HOMA, na.rm = TRUE)

dens_A <- density(before$HOMA, na.rm = TRUE)
ymax_plot_A <- max(dens_A$y)*1.05
legend_x_A <- min_val_A + 0.3*(max_val_A - min_val_A)
symbol_y_A <- seq(ymax_plot_A - 0.08*ymax_plot_A, ymax_plot_A - 0.22*ymax_plot_A, length.out = 3)
legend_y_top_A <- ymax_plot_A - 0.02*ymax_plot_A
legend_df_A <- data.frame(
  x = rep(legend_x_A, 3),
  y = symbol_y_A,
  label = c("Mean","Median","SD"),
  text = c(paste0("Mean: ", round(mean_val_A,2)),
           paste0("Median: ", round(median_val_A,2)),
           paste0("SD: ", round(sd_val_A,2)))
)
min_max_text_A <- paste0("Min: ", round(min_val_A,2), " | Max: ", round(max_val_A,2))

p_density_A <- ggplot(before, aes(HOMA)) +
  geom_density(fill="#A5D8FF", alpha=0.5, color="#1B6CA8", linewidth=1.1) +
  geom_vline(aes(xintercept=mean_val_A, color="Mean"), linewidth=1) +
  geom_vline(aes(xintercept=median_val_A, color="Median"), linewidth=1) +
  geom_vline(aes(xintercept=mean_val_A + sd_val_A, color="SD"), linetype="dotted") +
  geom_vline(aes(xintercept=mean_val_A - sd_val_A, color="SD"), linetype="dotted") +
  geom_point(data=legend_df_A, aes(x=x, y=y, color=label), size=1.5) +
  geom_text(data=legend_df_A, aes(x=x + 0.02*(max(before$HOMA)-min(before$HOMA)), y=y, label=text),
            hjust=0, color="black", size=3) +
  annotate("text", x=legend_x_A, y=legend_y_top_A, label=min_max_text_A,
           hjust=0, color="black", size=3) +
  scale_color_manual(values=c("Mean"="darkred","Median"="#005b96","SD"="#011f4b")) +
  labs(title="A: Including Outliers", x="HOMA-Insulin", y="Density") +
  theme_classic(base_size=10) +
  theme(legend.position="none",
        plot.title = element_text(face="bold", hjust=0, size=12)) +
  ylim(0, ymax_plot_A) +
  theme(aspect.ratio=0.9)

# -----------------------------
# PANEL B: Density After
# -----------------------------
mean_val_B   <- mean(before_no_outliers$HOMA, na.rm = TRUE)
median_val_B <- median(before_no_outliers$HOMA, na.rm = TRUE)
sd_val_B     <- sd(before_no_outliers$HOMA, na.rm = TRUE)
min_val_B    <- min(before_no_outliers$HOMA, na.rm = TRUE)
max_val_B    <- max(before_no_outliers$HOMA, na.rm = TRUE)

dens_B <- density(before_no_outliers$HOMA, na.rm = TRUE)
ymax_plot_B <- max(dens_B$y)*1.05
legend_x_B <- min_val_B + 0.3*(max_val_B - min_val_B)
symbol_y_B <- seq(ymax_plot_B - 0.08*ymax_plot_B, ymax_plot_B - 0.22*ymax_plot_B, length.out = 3)
legend_y_top_B <- ymax_plot_B - 0.02*ymax_plot_B
legend_df_B <- data.frame(
  x=rep(legend_x_B,3),
  y=symbol_y_B,
  label=c("Mean","Median","SD"),
  text=c(paste0("Mean: ", round(mean_val_B,2)),
         paste0("Median: ", round(median_val_B,2)),
         paste0("SD: ", round(sd_val_B,2)))
)
min_max_text_B <- paste0("Min: ", round(min_val_B,2), " | Max: ", round(max_val_B,2))

p_density_B <- ggplot(before_no_outliers, aes(HOMA)) +
  geom_density(fill="#A5D8FF", alpha=0.5, color="#1B6CA8", linewidth=1.1) +
  geom_vline(aes(xintercept=mean_val_B, color="Mean"), linewidth=1) +
  geom_vline(aes(xintercept=median_val_B, color="Median"), linewidth=1) +
  geom_vline(aes(xintercept=mean_val_B + sd_val_B, color="SD"), linetype="dotted") +
  geom_vline(aes(xintercept=mean_val_B - sd_val_B, color="SD"), linetype="dotted") +
  geom_point(data=legend_df_B, aes(x=x, y=y, color=label), size=1.5) +
  geom_text(data=legend_df_B, aes(x=x + 0.02*(max(before_no_outliers$HOMA)-min(before_no_outliers$HOMA)),
                                  y=y, label=text),
            hjust=0, color="black", size=3) +
  annotate("text", x=legend_x_B, y=legend_y_top_B, label=min_max_text_B,
           hjust=0, color="black", size=3) +
  scale_color_manual(values=c("Mean"="darkred","Median"="#005b96","SD"="#011f4b")) +
  labs(title="B: Post Outlier Removal", x="HOMA-Insulin", y="Density") +
  theme_classic(base_size=10) +
  theme(legend.position="none",
        plot.title = element_text(face="bold", hjust=0, size=12)) +
  ylim(0, ymax_plot_B) +
  theme(aspect.ratio=0.9)

# -----------------------------
# PANEL C: Boxplot Before
# -----------------------------
line_3sd <- median(before$HOMA, na.rm = TRUE) + 3*sd(before$HOMA, na.rm = TRUE)

before <- before %>%
  mutate(color_flag = ifelse(HOMA > line_3sd, "above_3SD", "normal"))

p_A <- ggplot(before, aes(x = "", y = HOMA)) +
  geom_boxplot(width = 0.4, fill = "#A5D8FF", alpha = 0.7, outlier.shape = NA) +
  geom_jitter(aes(color = color_flag), width = 0.1, size = 2, alpha = 0.6) +
  scale_color_manual(values = c("normal" = "#1B6CA8", "above_3SD" = "darkred")) +
  geom_hline(yintercept = line_3sd, linetype = "dashed", color = "gray", size = 0.7) +
  annotate("text", x = 1.2, y = line_3sd + 2, 
           label = paste0("Med. + 3SD = ", round(line_3sd,2)),
           hjust = 0, color = "gray20", size = 3) +
  labs(title = "C: Including Outliers", x = "", y = "HOMA-Insulin") +
  theme_classic(base_size = 10) +
  theme(plot.title = element_text(face = "bold", hjust = 0, size = 12),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "none") +
  theme(aspect.ratio = 0.9)

# -----------------------------
# PANEL D: Boxplot After
# -----------------------------
p_B <- ggplot(before_no_outliers, aes(x="", y=HOMA)) +
  geom_boxplot(width=0.4, fill="#A5D8FF", alpha=0.7, outlier.shape=NA) +
  geom_jitter(width=0.1, size=2, color="#1B6CA8", alpha=0.4) +
  labs(title="D: Post Outlier Removal", x="", y="HOMA-Insulin") +
  theme_classic(base_size=10) +
  theme(plot.title=element_text(face="bold", hjust=0, size=12),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) +
  theme(aspect.ratio=0.9)

# -----------------------------
# Combine 4 plots in 2x2 grid
# -----------------------------
# --- Top row: density plots ---
top_row <- cowplot::plot_grid(p_density_A, p_density_B, ncol = 2, labels = c("A", "B"))
# --- Bottom row: HOMA plots ---
bottom_row <- cowplot::plot_grid(p_A, p_B, ncol = 2, labels = c("C", "D"))
# --- Combine top and bottom rows vertically ---
combined_plot <- cowplot::plot_grid(top_row, bottom_row, ncol = 1, align = "v")
combined_plot

2.2.4 4. HOMA-IR- HbA1c comparison

# ============================================================
# Load libraries
# ============================================================
# ============================================================
# Shared Colors
# ============================================================
colors_homa <- c("Non-IR" = "#1B6CA8", "IR" = "#953553")

colors_hba1c <- c(
  "Normoglycemic" = "#1B6CA8",
  "Prediabetic"   = "#E69F00",
  "T2 Diabetic"   = "#953553"
)

# ============================================================
# -------------------- Study 1: HOMA-IR -----------------------
# ============================================================
df_f <- df_final %>%
  mutate(
    IR_status = ifelse(HOMA_insulin >= 2.5, 1, 0),
    IR_label  = ifelse(IR_status == 1, "IR", "Non-IR")
  )

counts_homa <- df_f %>% group_by(IR_label) %>% summarise(n = n())
legend_labels_homa <- paste0(counts_homa$IR_label, ": ", counts_homa$n)
names(legend_labels_homa) <- counts_homa$IR_label

p_homa <- ggplot(df_f, aes(x = "", y = HOMA_insulin, color = IR_label)) +
  geom_jitter(width = 0.1, size = 2, alpha = 0.8) +
  geom_hline(yintercept = 2.5, linetype = "dashed", color = "gray40", size = 1) +
  labs(title = "Study 1: HOMA-IR ≥ 2.5", y = "HOMA-IR", x = "", color = "IR Status") +
  theme_classic(base_size = 10) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        aspect.ratio = 0.9) +
  scale_color_manual(values = colors_homa, labels = legend_labels_homa)

# ============================================================
# -------------------- Study 2: HOMA-IR -----------------------
# ============================================================
df_s2 <- df_ucsd_final %>%
  mutate(
    IR_status = ifelse(HOMA_insulin >= 2.5, 1, 0),
    IR_label  = ifelse(IR_status == 1, "IR", "Non-IR")
  )

counts_homa_s2 <- df_s2 %>% group_by(IR_label) %>% summarise(n = n())
legend_labels_homa_s2 <- paste0(counts_homa_s2$IR_label, ": ", counts_homa_s2$n)
names(legend_labels_homa_s2) <- counts_homa_s2$IR_label

p_homa_s2 <- ggplot(df_s2, aes(x = "", y = HOMA_insulin, color = IR_label)) +
  geom_jitter(width = 0.1, size = 2, alpha = 0.8) +
  geom_hline(yintercept = 2.5, linetype = "dashed", color = "gray40", size = 1) +
  labs(title = "Study 2: HOMA-IR ≥ 2.5", y = "HOMA-IR", x = "", color = "IR Status") +
  theme_classic(base_size = 10) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        aspect.ratio = 0.9) +
  scale_color_manual(values = colors_homa, labels = legend_labels_homa_s2)

# ============================================================
# -------------------- Study 1: HbA1c -------------------------
# ============================================================
df_final <- df_final %>%
  mutate(
    HbA1c_value = HbA1c....,
    Glycemic_label = case_when(
      HbA1c_value < 5.7 ~ "Normoglycemic",
      HbA1c_value >= 5.7 & HbA1c_value < 6.5 ~ "Prediabetic",
      HbA1c_value >= 6.5 ~ "T2 Diabetic"
    )
  )

counts_hba1c <- df_final %>% group_by(Glycemic_label) %>% summarise(n = n())
legend_labels_hba1c <- paste0(counts_hba1c$Glycemic_label, ": ", counts_hba1c$n)
names(legend_labels_hba1c) <- counts_hba1c$Glycemic_label

p_hba1c <- ggplot(df_final, aes(x = "", y = HbA1c_value, color = Glycemic_label)) +
  geom_jitter(width = 0.12, size = 2, alpha = 0.8) +
  geom_hline(yintercept = 5.7, linetype = "dashed", color = "gray40", size = 1) +
  geom_hline(yintercept = 6.5, linetype = "dashed", color = "gray40", size = 1) +
  labs(title = "Study 1: HbA1c Classification", y = "HbA1c (%)", x = "", color = "Status") +
  theme_classic(base_size = 10) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        aspect.ratio = 1) +
  scale_color_manual(values = colors_hba1c, labels = legend_labels_hba1c)

# ============================================================
# -------------------- Study 2: HbA1c -------------------------
# ============================================================
df_s2 <- df_ucsd_final %>%
  mutate(
    HbA1c_value = HbA1c....,
    Glycemic_label = case_when(
      HbA1c_value < 5.7 ~ "Normoglycemic",
      HbA1c_value >= 5.7 & HbA1c_value < 6.5 ~ "Prediabetic",
      HbA1c_value >= 6.5 ~ "T2 Diabetic"
    )
  )

counts_hba1c_s2 <- df_s2 %>% group_by(Glycemic_label) %>% summarise(n = n())
legend_labels_hba1c_s2 <- paste0(counts_hba1c_s2$Glycemic_label, ": ", counts_hba1c_s2$n)
names(legend_labels_hba1c_s2) <- counts_hba1c_s2$Glycemic_label

p_hba1c_s2 <- ggplot(df_s2, aes(x = "", y = HbA1c_value, color = Glycemic_label)) +
  geom_jitter(width = 0.12, size = 2, alpha = 0.8) +
  geom_hline(yintercept = 5.7, linetype = "dashed", color = "gray40", size = 1) +
  geom_hline(yintercept = 6.5, linetype = "dashed", color = "gray40", size = 1) +
  labs(title = "Study 2: HbA1c Classification", y = "HbA1c (%)", x = "", color = "Status") +
  theme_classic(base_size = 10) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        aspect.ratio = 1) +
  scale_color_manual(values = colors_hba1c, labels = legend_labels_hba1c_s2)

# ============================================================
# -------------------- Combine All Plots ----------------------
# ============================================================

# Row 1: HOMA-IR (Study 1 vs Study 2)
row1 <- p_homa | p_homa_s2

# Row 2: HbA1c (Study 1 vs Study 2)
row2 <- p_hba1c | p_hba1c_s2

# Full figure
full_plot <- row1 / row2

full_plot

# ============================================================
# Function to summarize HbA1c per IR group with range
# ============================================================
summarize_hba1c_IR <- function(df, study_name) {
  
  df <- df %>%
    mutate(
      HbA1c_value = as.numeric(HbA1c....),
      IR_label = ifelse(HOMA_insulin >= 2.5, "IR", "Non-IR")
    )
  
  na_count <- sum(is.na(df$HbA1c_value))
  df_clean <- df %>% filter(!is.na(HbA1c_value))
  
  descriptives <- df_clean %>%
    group_by(IR_label) %>%
    summarise(
      N = n(),
      Mean = round(mean(HbA1c_value), 2),
      SD = round(sd(HbA1c_value), 2),
      Median = round(median(HbA1c_value), 2),
      Range = paste0(round(min(HbA1c_value),2), "–", round(max(HbA1c_value),2)),
      .groups = "drop"
    ) %>%
    mutate(Study = study_name,
           N_Missing = na_count) %>%
    select(Study, IR_label, N, N_Missing, Mean, SD, Median, Range)
  
  # T-test (only if 2 groups)
  if(length(unique(df_clean$IR_label)) == 2){
    t_res <- t.test(HbA1c_value ~ IR_label, data = df_clean)
    t_test <- tibble(
      Study = study_name,
      t = round(t_res$statistic, 2),
      df = round(t_res$parameter, 1),
      p = signif(t_res$p.value, 3)
    )
  } else {
    t_test <- tibble(
      Study = study_name,
      t = NA, df = NA, p = NA
    )
  }
  
  list(descriptives = descriptives, t_test = t_test)
}

# ============================================================
# Summarize studies
# ============================================================
summary_s1 <- summarize_hba1c_IR(df_final, "Study 1")
summary_s2 <- summarize_hba1c_IR(df_ucsd_final, "Study 2")

descriptives_table <- bind_rows(summary_s1$descriptives, summary_s2$descriptives)
t_tests_table <- bind_rows(summary_s1$t_test, summary_s2$t_test)

# ============================================================
# Merge for display, only show t-test once per study
# ============================================================
final_table <- descriptives_table %>%
  left_join(t_tests_table, by = "Study") %>%
  arrange(Study, IR_label)

# Remove t-test values for second row of each study to only display once
final_table <- final_table %>%
  group_by(Study) %>%
  mutate(
    t = ifelse(row_number() == 1, t, NA),
    df = ifelse(row_number() == 1, df, NA),
    p = ifelse(row_number() == 1, p, NA)
  ) %>%
  ungroup()

# ============================================================
# Display as HTML kable
# ============================================================
kable(final_table, "html", caption = "HbA1c Summary by IR Status") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = T, font_size = 14) %>%
  row_spec(0, bold = TRUE, background = "#F5FBFF") %>%
  collapse_rows(columns = 1, valign = "top") %>%
  add_header_above(c(" " = 2, "Descriptive Statistics" = 6, "T-Test (Non-IR vs IR)" = 3))

HbA1c Summary by IR Status
		Descriptive Statistics						T-Test (Non-IR vs IR)
Study	IR_label	N	N_Missing	Mean	SD	Median	Range	t	df	p
Study 1	IR	61	1	6.35	1.07	6.1	4.5–10.7	4.71	86.8	9.3e-06
Study 1	Non-IR	76	1	5.63	0.57	5.5	4.6–7.4	NA	NA	NA
Study 2	IR	26	0	6.22	0.67	6.1	5.2–7.7	3.21	41.2	2.6e-03
Study 2	Non-IR	30	0	5.73	0.43	5.7	4.9–6.6	NA	NA	NA

3 Descriptives

3.1 Clinical Visit

3.1.1 Study 1

# ============================================================
# 1. Create IR label
# ============================================================
df_final <- df_final %>%
  mutate(IR_label = ifelse(HOMA_insulin >= 2.5, "IR", "Non-IR"))

# ============================================================
# 2. Original variables to plot
# ============================================================
vars_to_plot <- c(
  "age",
  "weight_vsorres..Weight..kilograms.",
  "bmi_vsorres..BMI",
  "hip_vsorres..Hip.Circumference..cm.",
  "waist_vsorres..Waist.Circumference..cm.",
  "whr_vsorres..Waist.to.Hip.Ratio..WHR.",
  "pulse_vsorres..Heart.Rate..bpm.",
  "bp1_diabp_vsorres..Diastolic..mmHg.",
  "bp2_diabp_vsorres..Diastolic..mmHg.",
  "bp1_sysbp_vsorres..Systolic..mmHg.",
  "bp2_sysbp_vsorres..Systolic..mmHg.",
  "clinical_site"
)

# Keep only existing columns
vars_to_plot <- vars_to_plot[vars_to_plot %in% names(df_final)]

# ============================================================
# 3. Short names for titles
# ============================================================
short_names <- c(
  "bmi_vsorres..BMI"                         = "BMI",
  "bp1_diabp_vsorres..Diastolic..mmHg."     = "Diastolic_BP1",
  "bp2_diabp_vsorres..Diastolic..mmHg."     = "Diastolic_BP2",
  "bp1_sysbp_vsorres..Systolic..mmHg."     = "Systolic_BP1",
  "bp2_sysbp_vsorres..Systolic..mmHg."     = "Systolic_BP2",
  "pulse_vsorres..Heart.Rate..bpm."         = "PulseHR",
  "waist_vsorres..Waist.Circumference..cm." = "Waist_Circumference",
  "hip_vsorres..Hip.Circumference..cm."     = "Hip_Circumference",
  "weight_vsorres..Weight..kilograms."      = "Weight (kg)",
  "whr_vsorres..Waist.to.Hip.Ratio..WHR."   = "WaistHip_Ratio",
  "age"                                     = "Age",
  "clinical_site"                           = "Clinical_Site"
)

# Only keep short names for variables present
short_names <- short_names[names(short_names) %in% vars_to_plot]

# ============================================================
# 4. Colors
# ============================================================
fill_colors <- c("Non-IR" = "#A5D8FF", "IR" = "#FFB6C1")
edge_colors <- c("Non-IR" = "#1B6CA8", "IR" = "#C71585")

# ============================================================
# 5. Plotting function with p-value on the right
# ============================================================
plot_ir_variable <- function(var_name, data, short_names, fill_colors, edge_colors) {
  
  if (!var_name %in% names(data)) return(NULL)
  
  plot_data <- data %>% filter(!is.na(.data[[var_name]]))
  if (nrow(plot_data) == 0) return(NULL)
  
  plot_title <- ifelse(var_name %in% names(short_names), short_names[[var_name]], var_name)
  x <- plot_data[[var_name]]
  
  # Compute p-value
  p_val <- if (is.numeric(x)) {
    tryCatch(t.test(x ~ IR_label, data = plot_data)$p.value, error = function(e) NA)
  } else {
    tbl <- table(x, plot_data$IR_label)
    tryCatch(chisq.test(tbl)$p.value, error = function(e) NA)
  }
  
  # Format p-value as text
  p_text <- if (!is.na(p_val)) {
    if (p_val < 0.001) "p < 0.001" else paste0("p = ", signif(p_val, 3))
  } else {
    "p = NA"
  }
  
  # Numeric plot
  if (is.numeric(x)) {
    max_y <- max(x, na.rm = TRUE)
    min_y <- min(x, na.rm = TRUE)
    
    p <- ggplot(plot_data, aes(x = IR_label, y = .data[[var_name]], fill = IR_label, color = IR_label)) +
      geom_violin(alpha = 0.5, size = 0.8) +
      geom_jitter(width = 0.1, size = 1.5, alpha = 0.8) +
      scale_fill_manual(values = fill_colors) +
      scale_color_manual(values = edge_colors) +
      labs(title = plot_title, x = "IR Status", y = plot_title) +
      theme_classic(base_size = 9) +
      theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 9),    axis.text.y = element_text(face = "bold"),
            legend.position = "none",
            aspect.ratio = 1.3)
    
  } else { # Categorical plot
    p <- ggplot(plot_data, aes(x = .data[[var_name]], fill = IR_label)) +
      geom_bar(position = "dodge", color = "black") +
      scale_fill_manual(values = fill_colors) +
      labs(title = plot_title, x = plot_title, y = "Count") +
      theme_classic(base_size = 9) +
      theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 9),
            axis.text.x = element_text(angle = 45, hjust = 1),
            axis.text.y = element_text(face = "bold"),
            legend.position = "none",
            aspect.ratio = 1.3)
  }
  
  # Add p-value text annotation to the right
  # p <- p + annotate("text", x = Inf, y = Inf, label = p_text,
  #                   hjust = 1, vjust = 2.5, size = 3, fontface = "italic")
  
  return(p)
  
  # x = Inf, y = Inf, 
}

# ============================================================
# 6. Generate plots
# ============================================================
plot_list <- lapply(
  vars_to_plot,
  plot_ir_variable,
  data = df_final,
  short_names = short_names,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list <- plot_list[!sapply(plot_list, is.null)]

# ============================================================
# 7. Combine into grid (4 columns)
# ============================================================
big_panel <- wrap_plots(plot_list, ncol = 4)
big_panel

# Separate numeric and categorical variables

# ============================
# Identify numeric vs categorical variables
# ============================
num_vars <- vars_to_plot[sapply(df_final[vars_to_plot], is.numeric)]
cat_vars <- vars_to_plot[sapply(df_final[vars_to_plot], function(x) !is.numeric(x))]

# ============================
# Numeric summary
# ============================
num_summary <- lapply(num_vars, function(var) {
  data <- df_final %>% filter(!is.na(.data[[var]]))
  IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
  NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
  Total_vals <- data[[var]]
  
  # t-test
  t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
  p_raw <- if (!is.null(t_res)) t_res$p.value else NA
  
  # format p-value
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  tibble(
    Variable = ifelse(var %in% names(short_names), short_names[[var]], var),
    IR = ifelse(length(IR_vals) > 0,
                sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)), "–"),
    Non_IR = ifelse(length(NonIR_vals) > 0,
                    sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)), "–"),
    Total = ifelse(length(Total_vals) > 0,
                   sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)), "–"),
    `p-value` = p_val
  )
}) %>% bind_rows()

# ============================
# Categorical summary
# ============================
cat_summary <- lapply(cat_vars, function(var) {
  
  tbl <- table(df_final[[var]], df_final$IR_label)
  
  # Safe extraction of columns
  IR_col <- if ("IR" %in% colnames(tbl)) tbl[, "IR"] else rep(0, nrow(tbl))
  NonIR_col <- if ("Non-IR" %in% colnames(tbl)) tbl[, "Non-IR"] else rep(0, nrow(tbl))
  
  IR_total <- sum(IR_col)
  NonIR_total <- sum(NonIR_col)
  total_total <- sum(tbl)
  
  # chi-square test
  chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
  p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  # calculate percentages per column (so IR and Non-IR each sum to 100%)
  IR_str <- sprintf("%d (%.1f%%)", IR_col, 100 * IR_col / IR_total)
  NonIR_str <- sprintf("%d (%.1f%%)", NonIR_col, 100 * NonIR_col / NonIR_total)
  Total_str <- sprintf("%d (%.1f%%)", IR_col + NonIR_col, 100 * (IR_col + NonIR_col) / total_total)
  
  tibble(
    Variable = ifelse(var %in% names(short_names), short_names[[var]], var),
    IR = paste(IR_str, collapse = "; "),
    Non_IR = paste(NonIR_str, collapse = "; "),
    Total = paste(Total_str, collapse = "; "),
    `p-value` = p_val
  )
}) %>% bind_rows()

# ============================
# Combine tables
# ============================
final_table <- bind_rows(num_summary, cat_summary)

# ============================
# Add APA description row
# ============================
description_row <- tibble(
  Variable = "Note:",
  IR = "IR = mean ± SD OR count (%)",
  Non_IR = "Non-IR = mean ± SD OR count (%)",
  Total = "Total = mean ± SD OR count (%)",
  `p-value` = "t-test or χ²"
)

final_table2 <- bind_rows(description_row, final_table)

# ============================
# Add significance stars + bold
# ============================
add_stars <- function(p) {
  p_num <- suppressWarnings(as.numeric(p))
  
  # Handle "<0.001"
  if (grepl("<0.001", p)) return("**<0.001** ***")
  
  # Handle non-numeric entries (e.g., "–")
  if (is.na(p_num)) return(p)
  
  # Assign stars
  if (p_num < 0.001) return("**<0.001** ***")
  if (p_num < 0.01)   return(paste0("**", p, "**", " **"))
  if (p_num < 0.05)   return(paste0("**", p, "**", " *"))
  
  return(p)
}

final_table2$`p-value` <- sapply(final_table2$`p-value`, add_stars)

# ============================
# Identify significant rows
# ============================
p_char <- final_table$`p-value`
# Remove "<" and convert to numeric
p_numeric <- as.numeric(gsub("<", "", p_char))
sig_rows <- which(!is.na(p_numeric) & p_numeric < 0.05) + 1  # +1 for description row

# ============================
# Render APA-style table
# ============================
kbl(final_table2, "html",
    caption = "Table 1. Comparison of clinical visit variables by IR status",  escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>%   # header row
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>%  # description row
  column_spec(1, bold = TRUE) %>%
  column_spec(5, italic = TRUE) %>%
  row_spec(sig_rows, background = "#F5FBFF")

Table 1. Comparison of clinical visit variables by IR status
Variable	IR	Non_IR	Total	p-value
Note:	IR = mean ± SD OR count (%)	Non-IR = mean ± SD OR count (%)	Total = mean ± SD OR count (%)	t-test or χ²
Age	55.42 ± 9.13	56.62 ± 11.16	56.08 ± 10.28	0.489
Weight (kg)	99.21 ± 22.99	80.34 ± 25.54	88.82 ± 26.10	<0.001 ***
BMI	35.43 ± 8.08	29.31 ± 7.23	32.06 ± 8.19	<0.001 ***
Hip_Circumference	120.76 ± 16.65	109.22 ± 16.58	114.40 ± 17.52	<0.001 ***
Waist_Circumference	112.47 ± 14.91	95.57 ± 15.07	103.16 ± 17.16	<0.001 ***
WaistHip_Ratio	0.94 ± 0.08	0.88 ± 0.08	0.90 ± 0.09	<0.001 ***
PulseHR	72.65 ± 10.65	70.33 ± 11.62	71.37 ± 11.21	0.225
Diastolic_BP1	82.90 ± 9.92	78.05 ± 8.99	80.23 ± 9.69	0.003 **
Diastolic_BP2	79.82 ± 8.68	78.55 ± 9.57	79.12 ± 9.17	0.416
Systolic_BP1	133.00 ± 14.50	128.36 ± 14.42	130.44 ± 14.59	0.063
Systolic_BP2	130.85 ± 13.79	126.37 ± 15.87	128.38 ± 15.09	0.078
Clinical_Site	37 (59.7%); 25 (40.3%)	42 (55.3%); 34 (44.7%)	79 (57.2%); 59 (42.8%)	0.728

3.1.2 Study 2

# ============================================================
# 1. Create IR label
# ============================================================
df_ucsd_final <- df_ucsd_final %>%
  mutate(IR_label = ifelse(HOMA_insulin >= 2.5, "IR", "Non-IR"),
    clinical_site = factor(clinical_site))  # ensure proper categorical plotting

# ============================================================
# 2. Original variables to plot
# ============================================================
vars_to_plot <- c(
  "age",
  "weight_vsorres..Weight..kilograms.",
  "bmi_vsorres..BMI",
  "hip_vsorres..Hip.Circumference..cm.",
  "waist_vsorres..Waist.Circumference..cm.",
  "whr_vsorres..Waist.to.Hip.Ratio..WHR.",
  "pulse_vsorres..Heart.Rate..bpm.",
  "bp1_diabp_vsorres..Diastolic..mmHg.",
  "bp2_diabp_vsorres..Diastolic..mmHg.",
  "bp1_sysbp_vsorres..Systolic..mmHg.",
  "bp2_sysbp_vsorres..Systolic..mmHg.",
  "clinical_site"
)

vars_to_plot <- vars_to_plot[vars_to_plot %in% names(df_ucsd_final)]

# ============================================================
# 3. Short names for titles
# ============================================================
short_names <- c(
  "bmi_vsorres..BMI"                         = "BMI",
  "bp1_diabp_vsorres..Diastolic..mmHg."     = "Diastolic_BP1",
  "bp2_diabp_vsorres..Diastolic..mmHg."     = "Diastolic_BP2",
  "bp1_sysbp_vsorres..Systolic..mmHg."      = "Systolic_BP1",
  "bp2_sysbp_vsorres..Systolic..mmHg."      = "Systolic_BP2",
  "pulse_vsorres..Heart.Rate..bpm."         = "PulseHR",
  "waist_vsorres..Waist.Circumference..cm." = "Waist_Circumference",
  "hip_vsorres..Hip.Circumference..cm."     = "Hip_Circumference",
  "weight_vsorres..Weight..kilograms."      = "Weight (kg)",
  "whr_vsorres..Waist.to.Hip.Ratio..WHR."   = "WaistHip_Ratio",
  "age"                                     = "Age",
  "clinical_site"                           = "Clinical_Site"
)

short_names <- short_names[names(short_names) %in% vars_to_plot]

# ============================================================
# 4. Colors
# ============================================================
fill_colors <- c("Non-IR" = "#A5D8FF", "IR" = "#FFB6C1")
edge_colors <- c("Non-IR" = "#1B6CA8", "IR" = "#C71585")

# ============================================================
# 5. Plotting function with p-value on the right
# ============================================================
plot_ir_variable <- function(var_name, data, short_names, fill_colors, edge_colors) {
  
  if (!var_name %in% names(data)) return(NULL)
  
  plot_data <- data %>% filter(!is.na(.data[[var_name]]))
  if (nrow(plot_data) == 0) return(NULL)
  
  plot_title <- ifelse(var_name %in% names(short_names), short_names[[var_name]], var_name)
  x <- plot_data[[var_name]]
  
  # Compute p-value
  p_val <- if (is.numeric(x)) {
    tryCatch(t.test(x ~ IR_label, data = plot_data)$p.value, error = function(e) NA)
  } else {
    tbl <- table(x, plot_data$IR_label)
    tryCatch(chisq.test(tbl)$p.value, error = function(e) NA)
  }
  
  # Format p-value as text
  p_text <- if (!is.na(p_val)) {
    if (p_val < 0.001) "p < 0.001" else paste0("p = ", signif(p_val, 3))
  } else {
    "p = NA"
  }
  
  if (is.numeric(x)) {
    # Numeric variables: violin + jitter
    p <- ggplot(plot_data, aes(x = IR_label, y = .data[[var_name]], fill = IR_label, color = IR_label)) +
      geom_violin(alpha = 0.5, size = 0.8) +
      geom_jitter(width = 0.1, size = 1.5, alpha = 0.8) +
      scale_fill_manual(values = fill_colors) +
      scale_color_manual(values = edge_colors) +
      labs(title = plot_title, x = "IR Status", y = plot_title) +
      theme_classic(base_size = 9) +
      theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 9),
            axis.text.y = element_text(face = "bold"),
            legend.position = "none",
            aspect.ratio = 1.3)
    
  } else {
    # Ensure factor for categorical variables
    plot_data[[var_name]] <- factor(plot_data[[var_name]], levels = unique(plot_data[[var_name]]))
    
    # Generalized categorical plot
    tbl <- plot_data %>%
      group_by(.data[[var_name]], IR_label) %>%
      summarise(count = n(), .groups = "drop") %>%
      tidyr::complete(.data[[var_name]], IR_label = c("IR", "Non-IR"), fill = list(count = 0)) %>%
      group_by(IR_label) %>%
      mutate(perc = 100 * count / sum(count)) %>%
      ungroup()
    
    # Use dodge if multiple categories, single bar if only one (like clinical_site with one site)
    if (length(unique(plot_data[[var_name]])) == 1) {
      # Single bar
      p <- ggplot(tbl, aes(x = .data[[var_name]], y = perc, fill = IR_label)) +
        geom_col(color = "black") +
        scale_fill_manual(values = fill_colors) +
        labs(title = plot_title, x = "", y = "Percent") +
        theme_classic(base_size = 9) +
        theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 9),
              axis.text.x = element_text(face = "bold"),
              axis.text.y = element_text(face = "bold"),
              legend.position = "right",
              aspect.ratio = 1)
    } else {
      # Multiple categories
      p <- ggplot(tbl, aes(x = .data[[var_name]], y = perc, fill = IR_label)) +
        geom_col(position = "dodge", color = "black") +
        scale_fill_manual(values = fill_colors) +
        labs(title = plot_title, x = plot_title, y = "Percent") +
        theme_classic(base_size = 9) +
        theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 9),
              axis.text.x = element_text(angle = 45, hjust = 1),
              axis.text.y = element_text(face = "bold"),
              legend.position = "none",
              aspect.ratio = 1.3)
    }
  }
  
  return(p)
}

# ============================================================
# 6. Generate plots
# ============================================================
plot_list <- lapply(
  vars_to_plot,
  plot_ir_variable,
  data = df_ucsd_final,
  short_names = short_names,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list <- plot_list[!sapply(plot_list, is.null)]
big_panel <- wrap_plots(plot_list, ncol = 4)
big_panel

# ============================================================
# 7. Numeric and categorical summaries
# ============================================================
# ============================================================
# 7. Numeric and categorical summaries
# ============================================================

num_vars <- vars_to_plot[sapply(df_ucsd_final[vars_to_plot], is.numeric)]
cat_vars <- vars_to_plot[sapply(df_ucsd_final[vars_to_plot], function(x) !is.numeric(x))]

# -----------------------------
# Numeric summary
# -----------------------------
num_summary <- lapply(num_vars, function(var) {
  data <- df_ucsd_final %>% filter(!is.na(.data[[var]]))
  IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
  NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
  Total_vals <- data[[var]]
  
  t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
  p_raw <- if (!is.null(t_res)) t_res$p.value else NA
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  tibble(
    Variable = ifelse(var %in% names(short_names), short_names[[var]], var),
    IR = ifelse(length(IR_vals) > 0, sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)), "–"),
    Non_IR = ifelse(length(NonIR_vals) > 0, sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)), "–"),
    Total = ifelse(length(Total_vals) > 0, sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)), "–"),
    `p-value` = p_val
  )
}) %>% bind_rows()

# -----------------------------
# Categorical summary (fixed)
# -----------------------------
cat_summary <- lapply(cat_vars, function(var) {
  
  tbl <- table(df_ucsd_final[[var]], df_ucsd_final$IR_label)
  
  # Safe extraction
  IR_col <- if ("IR" %in% colnames(tbl)) tbl[, "IR"] else rep(0, nrow(tbl))
  NonIR_col <- if ("Non-IR" %in% colnames(tbl)) tbl[, "Non-IR"] else rep(0, nrow(tbl))
  total_col <- IR_col + NonIR_col
  total_total <- sum(total_col)
  
  # Chi-square test
  chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
  p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  # Percent of **total subjects** per group
  IR_str <- sprintf("%d (%.1f%%)", IR_col, 100 * IR_col / total_total)
  NonIR_str <- sprintf("%d (%.1f%%)", NonIR_col, 100 * NonIR_col / total_total)
  Total_str <- sprintf("%d (%.1f%%)", total_col, 100 * total_col / total_total)
  
  tibble(
    Variable = ifelse(var %in% names(short_names), short_names[[var]], var),
    IR = paste(IR_str, collapse = "; "),
    Non_IR = paste(NonIR_str, collapse = "; "),
    Total = paste(Total_str, collapse = "; "),
    `p-value` = p_val
  )
}) %>% bind_rows()

# -----------------------------
# Combine numeric + categorical
# -----------------------------
final_table <- bind_rows(num_summary, cat_summary)

# Description row
description_row <- tibble(
  Variable = "Note:",
  IR = "IR = mean ± SD OR count (%)",
  Non_IR = "Non-IR = mean ± SD OR count (%)",
  Total = "Total = mean ± SD OR count (%)",
  `p-value` = "t-test or χ²"
)

final_table2 <- bind_rows(description_row, final_table)

# -----------------------------
# Add significance stars
# -----------------------------
add_stars <- function(p) {
  p_num <- suppressWarnings(as.numeric(p))
  if (grepl("<0.001", p)) return("**<0.001** ***")
  if (is.na(p_num)) return(p)
  if (p_num < 0.001) return("**<0.001** ***")
  if (p_num < 0.01)   return(paste0("**", p, "** **"))
  if (p_num < 0.05)   return(paste0("**", p, "** *"))
  return(p)
}

final_table2$`p-value` <- sapply(final_table2$`p-value`, add_stars)

# -----------------------------
# Highlight significant rows
# -----------------------------
p_char <- final_table$`p-value`
p_numeric <- suppressWarnings(as.numeric(gsub("<|\\*|\\s", "", p_char)))
sig_rows <- which(!is.na(p_numeric) & p_numeric < 0.05) + 1  # +1 for description row

# -----------------------------
# Render table
# -----------------------------
kbl(final_table2, "html",
    caption = "Table 1. Comparison of variables by IR status (Study 2)",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>%
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(5, italic = TRUE) %>%
  row_spec(sig_rows, background = "#F5FBFF")

Table 1. Comparison of variables by IR status (Study 2)
Variable	IR	Non_IR	Total	p-value
Note:	IR = mean ± SD OR count (%)	Non-IR = mean ± SD OR count (%)	Total = mean ± SD OR count (%)	t-test or χ²
Age	61.27 ± 10.83	61.83 ± 9.68	61.57 ± 10.14	0.839
Weight (kg)	95.39 ± 19.94	75.18 ± 15.58	84.56 ± 20.30	<0.001 ***
BMI	34.31 ± 8.48	26.82 ± 4.65	30.30 ± 7.64	<0.001 ***
Hip_Circumference	115.88 ± 19.46	101.72 ± 11.46	108.29 ± 17.09	0.002 **
Waist_Circumference	110.62 ± 13.45	91.81 ± 14.72	100.54 ± 16.92	<0.001 ***
WaistHip_Ratio	0.96 ± 0.08	0.90 ± 0.10	0.93 ± 0.09	0.011 *
PulseHR	67.12 ± 10.84	62.73 ± 7.28	64.77 ± 9.29	0.088
Diastolic_BP1	80.35 ± 10.76	76.93 ± 13.14	78.52 ± 12.11	0.290
Diastolic_BP2	78.23 ± 12.48	75.07 ± 11.10	76.54 ± 11.76	0.324
Systolic_BP1	129.46 ± 17.74	132.03 ± 26.06	130.84 ± 22.42	0.664
Systolic_BP2	127.88 ± 19.29	131.27 ± 22.23	129.70 ± 20.80	0.545
Clinical_Site	26 (46.4%)	30 (53.6%)	56 (100.0%)	0.593

3.2 Demographics

3.2.1 Study 1

### ======================================================================
### LOAD & CLEAN DATA
### ======================================================================

### Load “other” demographic variables
other_vars <- read.csv("~/all_vars.csv") %>% 
  select(
    fh_dm2sb, years_of_education, sualcage, person_id, fh_dm2pt,
    sualckncf, suwndosfr, suwnkncf, subrkncf, subrdosfr, sulqdosfr, sulqkncf, pxhic1
  ) %>% 
  mutate(across(everything(), ~na_if(., 777)))

other_vars$participant_id <- other_vars$person_id


### Merge with df_final
df_f <- merge(other_vars, df_final, by = "participant_id")


### Load additional datasets
sleep <- read.csv("~/Downloads/sleep_participant_summary.csv")
steps <- read.csv("~/Downloads/steps_participant_summary.csv")

diet <- read.csv("~/Downloads/diet_data.csv") %>%
  mutate(across(everything(), ~na_if(., 777)))

hrv <- read.csv("~/Downloads/HRV_Joe.csv") %>%
  select(hr_from_ecg, hrv_rmssd, subject_id)

all_stress <- read.csv("~/Downloads/stress_overall_daily.csv")

survey <- read.csv("~/Downloads/survey_data.csv") %>%
  select(
    person_id,
    dmlpor..When.reflecting.on.your.typical.eating.ha,
    dmlfrveg..In.a.typical.day..how.many.servings.of.,
    dmlsugar..When.reflecting.on.your.typical.eating.,
    dmlvex..In.a.typical.week..how.often.do.you.engag,
    dmlgrain..In.a.typical.day..how.often.do.you.cons,
    dmlhex..How.often.would.you.say.that.you.engage.i,
    dmlact..What.would.you.consider.your.typical.acti
  ) %>%
  mutate(across(everything(), ~na_if(., 777)))


### Merge all datasets
df_test <- df_final %>%
  left_join(other_vars, by = "participant_id") %>%
  left_join(sleep, by = "participant_id") %>%
  left_join(steps, by = "participant_id") %>%
  left_join(diet, by = c("participant_id" = "person_id")) %>%
  left_join(hrv, by = c("participant_id" = "subject_id")) %>%
  left_join(all_stress, by = c("participant_id" = "id")) %>%
  left_join(survey, by = c("participant_id" = "person_id"))

df_test <- df_test %>% group_by(participant_id) %>% slice(1) %>% ungroup()



### ======================================================================
### RECODE VARIABLES
### ======================================================================

df_test$pxhic1_cat <- factor(
  ifelse(df_test$pxhic1 == 1, "yes",
  ifelse(df_test$pxhic1 == 0, "no", NA)),
  levels = c("no", "yes")
)

df_test$suwnkncf_cat <- factor(
  ifelse(df_test$suwnkncf == 1, "yes",
  ifelse(df_test$suwnkncf == 0, "no", NA)),
  levels = c("no", "yes")
)

df_test$sulqkncf_cat <- factor(
  ifelse(df_test$sulqkncf == 1, "yes",
  ifelse(df_test$sulqkncf == 0, "no",
  ifelse(df_test$sulqkncf == 777, "prefer not answer", NA))),
  levels = c("no", "yes", "prefer not answer")
)

df_test$fh_dm2pt_cat <- factor(
  ifelse(df_test$fh_dm2pt == 1, "yes",
  ifelse(df_test$fh_dm2pt == 0, "no",
  ifelse(df_test$fh_dm2pt == 555, "prefer not answer", NA))),
  levels = c("no", "yes", "prefer not answer")
)

df_test$fh_dm2sb_cat <- factor(
  ifelse(df_test$fh_dm2sb == 1, "yes",
  ifelse(df_test$fh_dm2sb == 0, "no",
  ifelse(df_test$fh_dm2sb == 555, "prefer not answer", NA))),
  levels = c("no", "yes", "prefer not answer")
)

df_test$fh_dm2_combined <- ifelse(
  df_test$fh_dm2pt == 555 | df_test$fh_dm2sb == 555, "prefer not answer",
  ifelse(df_test$fh_dm2pt == 1 | df_test$fh_dm2sb == 1, "yes",
  ifelse(df_test$fh_dm2pt == 0 & df_test$fh_dm2sb == 0, "no", NA))
)

df_test$fh_dm2_combined <- factor(
  df_test$fh_dm2_combined,
  levels = c("no", "yes", "prefer not answer")
)



### ======================================================================
### DEFINE short_names2 (FULL + COMPLETE)
### ======================================================================

short_names2 <- c(
  "pxhic1_cat" = "Insurance",
  "sualcage" = "Total years drinking",
  "suwndosfr" = "Wine (weekly)",
  "suwnkncf" = "Wine in past year",
  "sulqkncf" = "Liquor in past year",
  "sulqdosfr" = "Liquor (weekly)",
  "subrkncf" = "Beer/Ale in past year",
  "subrdosfr" = "Beer/Ale (weekly)",
  "fh_dm2_combined" = "Family T2D history (any)",
  "fh_dm2pt_cat" = "Parent T2D history",
  "fh_dm2sb_cat" = "Sibling T2D history",
  "years_of_education" = "Years of education"
)

### ======================================================================
### SPLIT INTO TWO VARIABLE SETS
### ======================================================================

vars_demo <- c(
  "pxhic1_cat",
  "fh_dm2_combined",
  "fh_dm2pt_cat",
  "fh_dm2sb_cat",
  "years_of_education"
)

vars_alcohol <- c(
  "sualcage",
  "suwndosfr",
  "suwnkncf",
  "sulqkncf",
  "sulqdosfr",
  "subrkncf",
  "subrdosfr"
)

short_demo    <- short_names2[names(short_names2) %in% vars_demo]
short_alcohol <- short_names2[names(short_names2) %in% vars_alcohol]

### ======================================================================
### PLOT PANELS
### (Assumes you already defined plot_ir_variable() earlier)
### ======================================================================

plot_list_demo <- lapply(
  vars_demo,
  plot_ir_variable,
  data = df_test,
  short_names = short_demo,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list_demo <- plot_list_demo[!sapply(plot_list_demo, is.null)]
big_panel_demo <- wrap_plots(plot_list_demo, ncol = 5)
big_panel_demo

### ======================================================================
### SUMMARY TABLE FUNCTION
### ======================================================================

add_stars <- function(p) {
  if (p == "–") return("–")
  pnum <- suppressWarnings(as.numeric(gsub("<", "", p)))
  if (is.na(pnum)) return(p)
  if (pnum < 0.001) return("**<0.001**")
  if (pnum < 0.01) return(paste0(p, " **"))
  if (pnum < 0.05) return(paste0(p, " *"))
  return(p)
}


create_summary_table <- function(var_list, short_names, df) {
  
  num_vars <- var_list[sapply(df[var_list], is.numeric)]
  cat_vars <- var_list[sapply(df[var_list], function(x) !is.numeric(x))]
  
  num_summary <- lapply(num_vars, function(var) {
    data <- df %>% filter(!is.na(.data[[var]]))
    IR_vals    <- data %>% filter(IR_label == "IR") %>% pull(var)
    NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
    Total_vals <- data[[var]]
    
    t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
    p_raw <- if (!is.null(t_res)) t_res$p.value else NA
    
    p_val <- if (is.na(p_raw)) "–"
      else if (p_raw < 0.001) "<0.001"
      else formatC(p_raw, digits = 3, format = "f")
    
    tibble(
      Variable = short_names[[var]],
      IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
      Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
      Total = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
      `p-value` = p_val
    )
  }) %>% bind_rows()
  
  
  cat_summary <- lapply(cat_vars, function(var) {
    tbl <- table(df[[var]], df$IR_label)
    IR_total    <- sum(tbl[, "IR"])
    NonIR_total <- sum(tbl[, "Non-IR"])
    total_total <- sum(tbl)
    
    chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
    p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
    
    p_val <- if (is.na(p_raw)) "–"
      else if (p_raw < 0.001) "<0.001"
      else formatC(p_raw, digits = 3, format = "f")
    
    IR_str    <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2], 100*x[2]/IR_total))
    NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1], 100*x[1]/NonIR_total))
    Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
    
    tibble(
      Variable = short_names[[var]],
      IR = paste(IR_str, collapse = "; "),
      Non_IR = paste(NonIR_str, collapse = "; "),
      Total = paste(Total_str, collapse = "; "),
      `p-value` = p_val
    )
  }) %>% bind_rows()
  
  
  final_table <- bind_rows(num_summary, cat_summary)
  
  missing_summary <- sapply(var_list, function(v) {
    n_miss <- sum(is.na(df[[v]]))
    pct <- round(100 * n_miss / nrow(df), 1)
    sprintf("%d (%.1f%%)", n_miss, pct)
  })
  
  final_table$`Missing (n, %)` <- missing_summary
  
  note_row <- tibble(
    Variable = "Note:",
    IR = "IR = mean ± SD or count (%)",
    Non_IR = "Non-IR = mean ± SD or count (%)",
    Total = "Total = mean ± SD or count (%)",
    `p-value` = "t-test or χ² test",
    `Missing (n, %)` = ""
  )
  
  final_table <- bind_rows(note_row, final_table)
  
  final_table$`p-value` <- sapply(final_table$`p-value`, add_stars)
  
  final_table
}

### ======================================================================
### GENERATE DEMOGRAPHIC TABLE
### ======================================================================

table_demo <- create_summary_table(vars_demo, short_demo, df_test)

# Identify significant rows
p_char <- table_demo$`p-value`[-1] # skip Note row
p_numeric <- suppressWarnings(as.numeric(gsub("[*<]", "", p_char)))
sig_rows <- which(!is.na(p_numeric) & p_numeric < 0.05) + 1  # +1 for Note row

kbl(table_demo, "html",
    caption = "Study 1. Demographic Variables by IR Status",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>%                          # Bold header
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>% # Italic Note row with background
  column_spec(1, bold = TRUE) %>%                       # Bold Variable column
  column_spec(5, italic = TRUE) %>%                     # Italic p-value column
  row_spec(sig_rows, background = "#F5FBFF")

Study 1. Demographic Variables by IR Status
Variable	IR	Non_IR	Total	p-value	Missing (n, %)
Note:	IR = mean ± SD or count (%)	Non-IR = mean ± SD or count (%)	Total = mean ± SD or count (%)	t-test or χ² test
Years of education	15.13 ± 3.78	16.17 ± 3.67	15.70 ± 3.74	0.105	25 (18.1%)
Insurance	13 (24.5%); 47 (88.7%)	24 (40.0%); 29 (48.3%)	37 (32.7%); 76 (67.3%)	0.014 *	0 (0.0%)
Family T2D history (any)	30 (48.4%); 37 (59.7%); 9 (14.5%)	18 (23.7%); 28 (36.8%); 16 (21.1%)	48 (34.8%); 65 (47.1%); 25 (18.1%)	0.089	0 (0.0%)
Parent T2D history	37 (59.7%); 33 (53.2%); 6 (9.7%)	23 (30.3%); 30 (39.5%); 9 (11.8%)	60 (43.5%); 63 (45.7%); 15 (10.9%)	0.270	0 (0.0%)
Sibling T2D history	55 (88.7%); 16 (25.8%); 5 (8.1%)	35 (46.1%); 14 (18.4%); 13 (17.1%)	90 (65.2%); 30 (21.7%); 18 (13.0%)	0.034 *	0 (0.0%)

3.2.2 Study 2

### ======================================================================
### STUDY 2: LOAD & CLEAN DATA
### ======================================================================
# Load “other” demographic variables
other_vars2 <- read.csv("~/all_vars.csv") %>% 
  select(
    fh_dm2sb, years_of_education, sualcage, person_id, fh_dm2pt,
    sualckncf, suwndosfr, suwnkncf, subrkncf, subrdosfr, sulqdosfr, sulqkncf, pxhic1
  ) %>% 
  mutate(across(everything(), ~na_if(., 777)))

other_vars2$participant_id <- other_vars2$person_id

# Merge with df_ucsd_final
df_s2 <- merge(other_vars2, df_ucsd_final, by = "participant_id")

# Load additional datasets (adjust paths as needed)
sleep2 <- read.csv("~/Downloads/sleep_participant_summary.csv")
steps2 <- read.csv("~/Downloads/steps_participant_summary.csv")
diet2 <- read.csv("~/Downloads/diet_data.csv") %>%
  mutate(across(everything(), ~na_if(., 777)))
hrv2 <- read.csv("~/Downloads/HRV_Joe.csv") %>%
  select(hr_from_ecg, hrv_rmssdc_hr_merged, hrv_rmssd_outlier, ecg_snr, hrv_rmssd, subject_id)
all_stress2 <- read.csv("~/Downloads/stress_overall_daily.csv")
survey2 <- read.csv("~/Downloads/survey_data.csv") %>%
  select(
    person_id,
    dmlpor..When.reflecting.on.your.typical.eating.ha,
    dmlfrveg..In.a.typical.day..how.many.servings.of.,
    dmlsugar..When.reflecting.on.your.typical.eating.,
    dmlvex..In.a.typical.week..how.often.do.you.engag,
    dmlgrain..In.a.typical.day..how.often.do.you.cons,
    dmlhex..How.often.would.you.say.that.you.engage.i,
    dmlact..What.would.you.consider.your.typical.acti
  ) %>%
  mutate(across(everything(), ~na_if(., 777)))

# Merge all datasets
df_s2 <- df_s2 %>%
  left_join(sleep2, by = "participant_id") %>%
  left_join(steps2, by = "participant_id") %>%
  left_join(diet2, by = c("participant_id" = "person_id")) %>%
  left_join(hrv2, by = c("participant_id" = "subject_id")) %>%
  left_join(all_stress2, by = c("participant_id" = "id")) %>%
  left_join(survey2, by = c("participant_id" = "person_id"))

##prob
df_s2 = df_s2 %>% dplyr::group_by(participant_id) %>% slice(1) %>% ungroup()
#56

### ======================================================================
### RECODE VARIABLES
### ======================================================================

# Insurance
df_s2$pxhic1_cat <- factor(
  ifelse(df_s2$pxhic1 == 1, "yes",
  ifelse(df_s2$pxhic1 == 0, "no", NA)),
  levels = c("no", "yes")
)

# Alcohol variables
df_s2$suwndosfr_cat <- factor(
  ifelse(df_s2$suwndosfr == 1, "yes",
  ifelse(df_s2$suwndosfr == 0, "no", NA)),
  levels = c("no", "yes")
)

df_s2$suwnkncf_cat <- factor(
  ifelse(df_s2$suwnkncf == 1, "yes",
  ifelse(df_s2$suwnkncf == 0, "no", NA)),
  levels = c("no", "yes")
)

df_s2$sulqkncf_cat <- factor(
  ifelse(df_s2$sulqkncf == 1, "yes",
  ifelse(df_s2$sulqkncf == 0, "no",
  ifelse(df_s2$sulqkncf == 777, "prefer not answer", NA))),
  levels = c("no", "yes", "prefer not answer")
)

# Family history
df_s2$fh_dm2pt_cat <- factor(
  ifelse(df_s2$fh_dm2pt == 1, "yes",
  ifelse(df_s2$fh_dm2pt == 0, "no",
  ifelse(df_s2$fh_dm2pt == 555, "prefer not answer", NA))),
  levels = c("no", "yes", "prefer not answer")
)

df_s2$fh_dm2sb_cat <- factor(
  ifelse(df_s2$fh_dm2sb == 1, "yes",
  ifelse(df_s2$fh_dm2sb == 0, "no",
  ifelse(df_s2$fh_dm2sb == 555, "prefer not answer", NA))),
  levels = c("no", "yes", "prefer not answer")
)

df_s2$fh_dm2_combined <- ifelse(
  df_s2$fh_dm2pt == 555 | df_s2$fh_dm2sb == 555, "prefer not answer",
  ifelse(df_s2$fh_dm2pt == 1 | df_s2$fh_dm2sb == 1, "yes",
  ifelse(df_s2$fh_dm2pt == 0 & df_s2$fh_dm2sb == 0, "no", NA))
)

df_s2$fh_dm2_combined <- factor(
  df_s2$fh_dm2_combined,
  levels = c("no", "yes", "prefer not answer")
)

### ======================================================================
### DEFINE SHORT NAMES
### ======================================================================

short_names2 <- c(
  "pxhic1_cat" = "Insurance",
  "sualcage" = "Total years drinking",
  "suwndosfr" = "Wine (weekly)",
  "suwnkncf" = "Wine in past year",
  "sulqkncf" = "Liquor in past year",
  "sulqdosfr" = "Liquor (weekly)",
  "subrkncf" = "Beer/Ale in past year",
  "subrdosfr" = "Beer/Ale (weekly)",
  "fh_dm2_combined" = "Family T2D history (any)",
  "fh_dm2pt_cat" = "Parent T2D history",
  "fh_dm2sb_cat" = "Sibling T2D history",
  "years_of_education" = "Years of education"
)

### ======================================================================
### SELECT VARIABLES FOR DEMOGRAPHIC TABLE
### ======================================================================

vars_demo2 <- c(
  "pxhic1_cat",
  "fh_dm2_combined",
  "fh_dm2pt_cat",
  "fh_dm2sb_cat",
  "years_of_education"
)

short_demo2 <- short_names2[names(short_names2) %in% vars_demo2]

### ======================================================================
### CREATE SUMMARY TABLE FUNCTION (APA STYLE)
### ======================================================================

create_summary_table2 <- function(var_list, short_names, df) {
  num_vars <- var_list[sapply(df[var_list], is.numeric)]
  cat_vars <- var_list[sapply(df[var_list], function(x) !is.numeric(x))]
  
  num_summary <- lapply(num_vars, function(var) {
    data <- df %>% filter(!is.na(.data[[var]]))
    IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
    NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
    Total_vals <- data[[var]]
    
    t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
    p_raw <- if (!is.null(t_res)) t_res$p.value else NA
    p_val <- if (is.na(p_raw)) "–"
    else if (p_raw < 0.001) "<0.001"
    else formatC(p_raw, digits = 3, format = "f")
    
    tibble(
      Variable = short_names[[var]],
      IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
      Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
      Total = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
      `p-value` = p_val
    )
  }) %>% bind_rows()
  
  cat_summary <- lapply(cat_vars, function(var) {
    tbl <- table(df[[var]], df$IR_label)
    IR_total <- sum(tbl[, "IR"])
    NonIR_total <- sum(tbl[, "Non-IR"])
    total_total <- sum(tbl)
    
    chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
    p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
    p_val <- if (is.na(p_raw)) "–"
    else if (p_raw < 0.001) "<0.001"
    else formatC(p_raw, digits = 3, format = "f")
    
    # Special case for clinical_site
    if (var == "clinical_site") {
      IR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x["IR"], 100*x["IR"]/total_total))
      NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x["Non-IR"], 100*x["Non-IR"]/total_total))
      Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
    } else {
      IR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x["IR"], 100*x["IR"]/IR_total))
      NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x["Non-IR"], 100*x["Non-IR"]/NonIR_total))
      Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
    }
    
    tibble(
      Variable = short_names[[var]],
      IR = paste(IR_str, collapse = "; "),
      Non_IR = paste(NonIR_str, collapse = "; "),
      Total = paste(Total_str, collapse = "; "),
      `p-value` = p_val
    )
  }) %>% bind_rows()
  
  final_table <- bind_rows(num_summary, cat_summary)
  
  missing_summary <- sapply(var_list, function(v) {
    n_miss <- sum(is.na(df[[v]]))
    pct <- round(100*n_miss/nrow(df),1)
    sprintf("%d (%.1f%%)", n_miss, pct)
  })
  final_table$`Missing (n, %)` <- missing_summary
  
  note_row <- tibble(
    Variable = "Note:",
    IR = "IR = mean ± SD or count (%)",
    Non_IR = "Non-IR = mean ± SD or count (%)",
    Total = "Total = mean ± SD or count (%)",
    `p-value` = "t-test or χ² test",
    `Missing (n, %)` = ""
  )
  final_table <- bind_rows(note_row, final_table)
  
  final_table$`p-value` <- sapply(final_table$`p-value`, add_stars)
  final_table
}

### ======================================================================
### GENERATE STUDY 2 DEMOGRAPHIC TABLE
### ======================================================================

table_demo2 <- create_summary_table2(vars_demo2, short_demo2, df_s2)

# Identify significant rows
p_char <- table_demo2$`p-value`[-1] # skip Note row
p_numeric <- suppressWarnings(as.numeric(gsub("[*<]", "", p_char)))
sig_rows <- which(!is.na(p_numeric) & p_numeric < 0.05) + 1

# Render APA-style table
# Remove NULLs in case any plots failed
plot_list_demo2 <- lapply(
  vars_demo2,
  plot_ir_variable,
  data = df_s2,
  short_names = short_demo2,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list_demo2 <- plot_list_demo2[!sapply(plot_list_demo2, is.null)]

# Combine into a single panel
big_panel_demo2 <- wrap_plots(plot_list_demo2, ncol = 5)
big_panel_demo2

kbl(table_demo2, "html",
    caption = "Study 2. Demographic Variables by IR Status (Study 2)",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>%
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(5, italic = TRUE) %>%
  row_spec(sig_rows, background = "#F5FBFF")

Study 2. Demographic Variables by IR Status (Study 2)
Variable	IR	Non_IR	Total	p-value	Missing (n, %)
Note:	IR = mean ± SD or count (%)	Non-IR = mean ± SD or count (%)	Total = mean ± SD or count (%)	t-test or χ² test
Years of education	15.54 ± 2.92	16.60 ± 2.51	16.11 ± 2.73	0.154	13 (23.2%)
Insurance	10 (47.6%); 11 (52.4%)	6 (27.3%); 16 (72.7%)	16 (37.2%); 27 (62.8%)	0.287	0 (0.0%)
Family T2D history (any)	10 (38.5%); 12 (46.2%); 4 (15.4%)	15 (50.0%); 9 (30.0%); 6 (20.0%)	25 (44.6%); 21 (37.5%); 10 (17.9%)	0.461	1 (1.8%)
Parent T2D history	12 (48.0%); 10 (40.0%); 3 (12.0%)	17 (56.7%); 9 (30.0%); 4 (13.3%)	29 (52.7%); 19 (34.5%); 7 (12.7%)	0.738	0 (0.0%)
Sibling T2D history	15 (57.7%); 9 (34.6%); 2 (7.7%)	20 (66.7%); 6 (20.0%); 4 (13.3%)	35 (62.5%); 15 (26.8%); 6 (10.7%)	0.427	0 (0.0%)

3.3 Alcohol Use

3.3.1 Study 1

### ======================================================================
### GENERATE ALCOHOL TABLE
### ======================================================================
### ======================================================================
### STUDY 2: PLOT PANELS
### ======================================================================
plot_list_alcohol <- lapply(
  vars_alcohol,
  plot_ir_variable,
  data = df_test,
  short_names = short_alcohol,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list_alcohol <- plot_list_alcohol[!sapply(plot_list_alcohol, is.null)]
big_panel_alcohol <- wrap_plots(plot_list_alcohol, ncol = 4)
big_panel_alcohol

table_alcohol <- create_summary_table(vars_alcohol, short_alcohol, df_test)


kbl(table_alcohol, "html",
    caption = "Study 1. Alcohol Variables by IR Status",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(sig_rows, background = "#F5FBFF")

Study 1. Alcohol Variables by IR Status
Variable	IR	Non_IR	Total	p-value	Missing (n, %)
Note:	IR = mean ± SD or count (%)	Non-IR = mean ± SD or count (%)	Total = mean ± SD or count (%)	t-test or χ² test
Total years drinking	19.76 ± 7.22	18.70 ± 3.29	19.20 ± 5.48	0.344	31 (22.5%)
Wine (weekly)	0.90 ± 1.40	2.97 ± 4.68	2.09 ± 3.78	0.012 *	70 (50.7%)
Wine in past year	0.58 ± 0.50	0.68 ± 0.47	0.64 ± 0.48	0.270	31 (22.5%)
Liquor in past year	0.70 ± 0.46	0.60 ± 0.49	0.64 ± 0.48	0.266	31 (22.5%)
Liquor (weekly)	1.29 ± 2.64	2.18 ± 3.24	1.72 ± 2.96	0.218	70 (50.7%)
Beer/Ale in past year	0.50 ± 0.51	0.63 ± 0.49	0.57 ± 0.50	0.174	31 (22.5%)
Beer/Ale (weekly)	1.08 ± 2.55	3.14 ± 6.78	2.30 ± 5.52	0.103	77 (55.8%)

3.3.2 Study 2

vars_alcohol2 <- c(
  "sualcage",
  "suwndosfr",
  "suwnkncf",
  "sulqkncf",
  "sulqdosfr",
  "subrkncf",
  "subrdosfr"
)

short_demo2 <- short_names2[names(short_names2) %in% vars_demo2]
short_alcohol2 <- short_names2[names(short_names2) %in% vars_alcohol2]
# -------------------------------------
# Alcohol plots
# -------------------------------------
plot_list_alcohol2 <- lapply(
  vars_alcohol2,
  plot_ir_variable,
  data = df_s2,
  short_names = short_alcohol2,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list_alcohol2 <- plot_list_alcohol2[!sapply(plot_list_alcohol2, is.null)]
big_panel_alcohol2 <- wrap_plots(plot_list_alcohol2, ncol = 4)
big_panel_alcohol2

### ======================================================================
### STUDY 2 — SAFE ALCOHOL VARIABLE LIST
### ======================================================================

vars_alcohol2 <- c(
  "sualcage",
  "suwndosfr",
  "suwnkncf",
  "sulqkncf",
  "sulqdosfr",
  "subrkncf",
  "subrdosfr"
)

# Keep ONLY variables that exist in df_s2
vars_alcohol2 <- vars_alcohol2[vars_alcohol2 %in% names(df_s2)]
short_alcohol2 <- short_names2[names(short_names2) %in% vars_alcohol2]


### ======================================================================
### STUDY 2 — ALCOHOL PLOTS
### ======================================================================

plot_list_alcohol2 <- lapply(
  vars_alcohol2,
  plot_ir_variable,
  data = df_s2,
  short_names = short_alcohol2,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)


### ======================================================================
### STUDY 2 ALCOHOL TABLE  — FIXED
### ======================================================================
vars_alcohol2 <- c(
  "sualcage",
  "suwndosfr",
  "suwnkncf",
  "sulqkncf",
  "sulqdosfr",
  "subrkncf",
  "subrdosfr"
)

# Keep ONLY variables that exist in df_s2
vars_alcohol2 <- vars_alcohol2[vars_alcohol2 %in% names(df_s2)]
short_alcohol2 <- short_names2[names(short_names2) %in% vars_alcohol2]


### ======================================================================
### STUDY 2 — ALCOHOL PLOTS

### ======================================================================
### STUDY 2 ALCOHOL TABLE  — FIXED
### ======================================================================

table_alcohol2 <- create_summary_table(vars_alcohol2, short_alcohol2, df_s2)

kbl(table_alcohol2, "html",          # <-- FIXED: now using table_alcohol2
    caption = "Study 2. Alcohol Variables by IR Status",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center")  %>%
  row_spec(sig_rows, background = "#F5FBFF")

Study 2. Alcohol Variables by IR Status
Variable	IR	Non_IR	Total	p-value	Missing (n, %)
Note:	IR = mean ± SD or count (%)	Non-IR = mean ± SD or count (%)	Total = mean ± SD or count (%)	t-test or χ² test
Total years drinking	19.32 ± 5.76	19.08 ± 3.98	19.20 ± 4.90	0.865	6 (10.7%)
Wine (weekly)	1.42 ± 2.15	1.12 ± 1.50	1.25 ± 1.78	0.692	28 (50.0%)
Wine in past year	0.48 ± 0.51	0.64 ± 0.49	0.56 ± 0.50	0.264	6 (10.7%)
Liquor in past year	0.60 ± 0.50	0.56 ± 0.51	0.58 ± 0.50	0.780	6 (10.7%)
Liquor (weekly)	1.80 ± 3.86	0.71 ± 1.33	1.28 ± 2.93	0.319	27 (48.2%)
Beer/Ale in past year	0.56 ± 0.51	0.68 ± 0.48	0.62 ± 0.49	0.392	6 (10.7%)
Beer/Ale (weekly)	1.71 ± 3.02	0.88 ± 1.96	1.26 ± 2.49	0.385	25 (44.6%)

3.4 Physical Activity

3.4.1 Study 1

# other_vars <- read.csv("~/all_vars.csv") %>% 
#   select(fh_dm2sb, years_of_education, sualcage, sualckncf, sualcdrv, pxhic1)  %>%
#   mutate(across(everything(), ~na_if(., 777)))
# 
# other_vars$participant_id = other_vars$person_id
# #length(other_vars$participant_id)
# 
# df_f = merge(other_vars, df_final,by= "person_id", by.y="participant_id")
# #length(df_f$participant_id) #138
# sleep <- read.csv("~/Downloads/sleep_participant_summary.csv")
# steps <- read.csv("~/Downloads/steps_participant_summary.csv")
# diet <- read.csv("~/Downloads/diet_data.csv") %>%
#   mutate(across(everything(), ~na_if(., 777)))
# hrv  <- read.csv("~/Downloads/HRV_Joe.csv") %>%
#   dplyr::select(hr_from_ecg, hrv_rmssdc_hr_merged, hrv_rmssd_outlier, ecg_snr, hrv_rmssd, subject_id)
# all_stress <- read.csv("~/Downloads/stress_overall_daily.csv")
# survey = read.csv("~/Downloads/survey_data.csv") %>%
#   # Select the columns you want
#   dplyr::select(
#     person_id,
#     dmlpor..When.reflecting.on.your.typical.eating.ha,
#     dmlfrveg..In.a.typical.day..how.many.servings.of.,
#     dmlsugar..When.reflecting.on.your.typical.eating.,
#     dmlvex..In.a.typical.week..how.often.do.you.engag,
#     dmlgrain..In.a.typical.day..how.often.do.you.cons,
#     dmlhex..How.often.would.you.say.that.you.engage.i,
#     dmlact..What.would.you.consider.your.typical.acti,
#   ) %>%
#   # Replace 777 with NA
#   dplyr::mutate(across(everything(), ~na_if(., 777)))
# 
# #cgm <- read.csv("~/Downloads/cgm_cleaned_daily.csv")
# 
# df_test <- df_final %>%
#   left_join(other_vars, by = "participant_id") %>%
#   left_join(sleep, by = "participant_id") %>%
#   left_join(steps, by = "participant_id") %>%
#   left_join(diet, by = c("participant_id" = "person_id")) %>%
#   left_join(hrv, by = c("participant_id" = "subject_id")) %>%
#   left_join(all_stress, by = c("participant_id" = "id")) %>% 
#   left_join(survey, by = c("participant_id" = "person_id"))
# 
# # Keep only one row per participant
# df_test <- df_test %>%
#   group_by(participant_id) %>%
#   slice(1) %>%   # keep the first row per participant
#   ungroup()

# Check that each participant appears only once
#table(duplicated(df_test$participant_id))  # should all be FALSE 112

Study 1. Physical Activity by IR Status
Variable	IR	Non_IR	Total	p-value	Missing (n, %)
Note:	IR = mean ± SD or count (%)	Non-IR = mean ± SD or count (%)	Total = mean ± SD or count (%)	t-test or χ² test	Number of missing values per variable
Vig Exercise Frequency	3.71 ± 4.04	5.66 ± 4.19	4.78 ± 4.22	0.006 **	0 (0%)
Typical Activity Level	5.81 ± 3.75	7.43 ± 3.51	6.70 ± 3.70	0.010 *	0 (0%)
Mean Total Steps	7357.54 ± 4118.51	7549.96 ± 4239.77	7463.51 ± 4171.62	0.788	0 (0%)
SD Total Steps	6239.10 ± 2556.69	6669.63 ± 3222.56	6476.20 ± 2939.70	0.383	0 (0%)
CV Total Steps	1.00 ± 0.59	0.97 ± 0.32	0.98 ± 0.46	0.708	0 (0%)
Mean Active Minutes	182.29 ± 107.27	184.60 ± 87.17	183.56 ± 96.36	0.892	0 (0%)
Mean Activity Onset	2.91 ± 1.13	2.90 ± 1.08	2.90 ± 1.10	0.941	0 (0%)
SD Activity Onset	4.07 ± 1.16	4.04 ± 0.97	4.05 ± 1.05	0.863	0 (0%)
Pct Steps Morning	16.09 ± 7.48	16.54 ± 7.35	16.33 ± 7.38	0.723	0 (0%)
Pct Steps Afternoon	24.82 ± 9.26	22.46 ± 6.78	23.52 ± 8.05	0.097	0 (0%)
Pct Steps Evening	18.86 ± 7.03	19.02 ± 7.21	18.95 ± 7.10	0.897	0 (0%)
Activity Midpoint	11.24 ± 1.16	11.10 ± 1.12	11.16 ± 1.14	0.484	0 (0%)
Mean Steps Weekend	7500.16 ± 4628.36	6984.66 ± 4292.10	7216.26 ± 4437.28	0.503	0 (0%)
Mean Steps Weekday	7284.68 ± 4238.04	7759.95 ± 4874.57	7546.42 ± 4589.29	0.541	0 (0%)

3.4.2 Study 2

### ======================================================================
### STUDY 2 — PHYSICAL ACTIVITY PLOTS + TABLE 
### Using df_ucsd_final
### ======================================================================

# ---------------------------------------------------------
# VARIABLES TO PLOT (must match Study 1 structure exactly)
# ---------------------------------------------------------
vars_to_plot_s2 <- c(
  "dmlhex..How.often.would.you.say.that.you.engage.",
  "dmlvex..In.a.typical.week..how.often.do.you.engag",
  "dmlact..What.would.you.consider.your.typical.acti",
  "mean_total_steps",
  "sd_total_steps",
  "cv_total_steps",
  "mean_active_minutes",
  "mean_activity_onset_h",
  "sd_activity_onset_h",
  "mean_pct_steps_morning",
  "mean_pct_steps_afternoon",
  "mean_pct_steps_evening",
  "mean_activity_midpoint_h",
  "mean_steps_weekend",
  "mean_steps_weekday"
)

# Only keep variables that actually exist in df_s2
vars_to_plot_s2 <- vars_to_plot_s2[vars_to_plot_s2 %in% names(df_s2)]

# ---------------------------------------------------------
# SHORT NAMES — FILTER + ENSURE CHARACTER VECTOR
# ---------------------------------------------------------
short_names_s2 <- c(
  "dmlhex..How.often.would.you.say.that.you.engage." = "Home Exercise Freq.",
  "dmlvex..In.a.typical.week..how.often.do.you.engag" = "Vig. Exercise Freq.",
  "dmlact..What.would.you.consider.your.typical.acti" = "Typical Activity",
  "mean_total_steps" = "Mean Total Steps",
  "sd_total_steps" = "SD Total Steps",
  "cv_total_steps" = "CV Total Steps",
  "mean_active_minutes" = "Mean Active Min",
  "mean_activity_onset_h" = "Mean Activity Onset",
  "sd_activity_onset_h" = "SD Activity Onset",
  "mean_pct_steps_morning" = "Pct Steps Morning",
  "mean_pct_steps_afternoon" = "Pct Steps Afternoon",
  "mean_pct_steps_evening" = "Pct Steps Evening",
  "mean_activity_midpoint_h" = "Activity Midpoint",
  "mean_steps_weekend" = "Mean Steps Wknd",
  "mean_steps_weekday" = "Mean Steps Wkday"
)

# Filter + force character vector (fix for earlier Study 2 table error)
short_names_s2 <- unlist(short_names_s2[names(short_names_s2) %in% vars_to_plot_s2])

# ---------------------------------------------------------
# PLOTS (Study 2)
# ---------------------------------------------------------
plot_list_s2 <- lapply(
  vars_to_plot_s2,
  plot_ir_variable,
  data = df_s2,
  short_names = short_names_s2,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list_s2 <- plot_list_s2[!sapply(plot_list_s2, is.null)]
big_panel_s2 <- wrap_plots(plot_list_s2, ncol = 5)
big_panel_s2

### ======================================================================
### STUDY 2 — SUMMARY TABLE
### ======================================================================

num_vars_s2 <- vars_to_plot_s2[sapply(df_s2[vars_to_plot_s2], is.numeric)]
cat_vars_s2 <- vars_to_plot_s2[sapply(df_s2[vars_to_plot_s2], function(x) !is.numeric(x))]

# ---------------------------------------------------------
# Numeric variables summary
# ---------------------------------------------------------
num_summary_s2 <- lapply(num_vars_s2, function(var) {
  data <- df_s2 %>% filter(!is.na(.data[[var]]))
  IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
  NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
  Total_vals <- data[[var]]
  
  t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
  p_raw <- if (!is.null(t_res)) t_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  tibble(
    Variable = short_names_s2[[var]],
    IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
    Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
    Total   = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
    `p-value` = p_val
  )
}) %>% bind_rows()

# ---------------------------------------------------------
# Categorical variables summary
# ---------------------------------------------------------
cat_summary_s2 <- lapply(cat_vars_s2, function(var) {
  tbl <- table(df_s2[[var]], df_s2$IR_label)
  IR_total <- sum(tbl[, "IR"])
  NonIR_total <- sum(tbl[, "Non-IR"])
  total_total <- sum(tbl)
  
  chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
  p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  IR_str    <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2],   100*x[2]/IR_total))
  NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1],   100*x[1]/NonIR_total))
  Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
  
  tibble(
    Variable = short_names_s2[[var]],
    IR       = paste(IR_str, collapse = "; "),
    Non_IR   = paste(NonIR_str, collapse = "; "),
    Total    = paste(Total_str, collapse = "; "),
    `p-value` = p_val
  )
}) %>% bind_rows()

# ---------------------------------------------------------
# Combine tables
# ---------------------------------------------------------
final_table_s2 <- bind_rows(num_summary_s2, cat_summary_s2)

final_table_s2$`p-value` <- sapply(final_table_s2$`p-value`, add_stars)

description_row_s2 <- tibble(
  Variable = "Note:",
  IR = "IR = mean ± SD or count (%)",
  Non_IR = "Non-IR = mean ± SD or count (%)",
  Total = "Total = mean ± SD or count (%)",
  `p-value` = "t-test or χ² test"
)

final_table_s2 <- bind_rows(description_row_s2, final_table_s2)
final_table_s2$`p-value` <- sapply(final_table_s2$`p-value`, add_stars)

# ---------------------------------------------------------
# Missingness per variable
# ---------------------------------------------------------
missing_summary_s2 <- sapply(vars_to_plot_s2, function(var) {
  n_missing <- sum(is.na(df_s2[[var]]))
  pct_missing <- round(100 * n_missing / nrow(df_s2), 1)
  paste0(n_missing, " (", pct_missing, "%)")
})

missing_col_s2 <- c("Number of missing values per variable", missing_summary_s2)
final_table_s2$`Missing (n, %)` <- missing_col_s2

# ---------------------------------------------------------
# Significant rows
# ---------------------------------------------------------
p_char_s2 <- final_table_s2$`p-value`
p_numeric_s2 <- as.numeric(gsub("<", "", p_char_s2))
sig_rows_s2 <- which(!is.na(p_numeric_s2) & p_numeric_s2 < 0.05) + 1

# ---------------------------------------------------------
# Render
# ---------------------------------------------------------
kbl(final_table_s2, "html", 
    caption = "Study 2. Physical Activity Variables by IR Status",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>% 
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(5, italic = TRUE) %>%
  row_spec(sig_rows_s2, background = "#F5FBFF")

Study 2. Physical Activity Variables by IR Status
Variable	IR	Non_IR	Total	p-value	Missing (n, %)
Note:	IR = mean ± SD or count (%)	Non-IR = mean ± SD or count (%)	Total = mean ± SD or count (%)	t-test or χ² test	Number of missing values per variable
Vig. Exercise Freq.	5.00 ± 4.24	6.17 ± 4.49	5.62 ± 4.38	0.322	0 (0%)
Typical Activity	6.35 ± 3.89	8.17 ± 3.34	7.32 ± 3.69	0.068	0 (0%)
Mean Total Steps	5199.23 ± 4832.78	5652.44 ± 2697.07	5442.02 ± 3808.38	0.674	0 (0%)
SD Total Steps	5619.60 ± 3251.97	4783.17 ± 1750.44	5170.41 ± 2566.08	0.258	2 (3.6%)
CV Total Steps	1.24 ± 0.55	0.97 ± 0.44	1.10 ± 0.51	0.056	2 (3.6%)
Mean Active Min	135.38 ± 87.68	170.73 ± 78.37	154.32 ± 83.96	0.120	0 (0%)
Mean Activity Onset	3.14 ± 1.41	3.85 ± 1.80	3.53 ± 1.66	0.106	1 (1.8%)
SD Activity Onset	4.79 ± 1.92	4.52 ± 1.33	4.64 ± 1.62	0.556	2 (3.6%)
Pct Steps Morning	16.02 ± 10.02	19.47 ± 9.48	17.87 ± 9.80	0.193	0 (0%)
Pct Steps Afternoon	19.23 ± 11.03	23.18 ± 7.84	21.35 ± 9.57	0.135	0 (0%)
Pct Steps Evening	14.51 ± 7.04	19.77 ± 10.30	17.33 ± 9.25	0.028 *	0 (0%)
Activity Midpoint	10.94 ± 1.10	11.28 ± 1.28	11.13 ± 1.20	0.295	1 (1.8%)
Mean Steps Wknd	6190.92 ± 6284.24	6859.84 ± 4162.77	6562.54 ± 5170.32	0.656	2 (3.6%)
Mean Steps Wkday	5059.24 ± 4699.78	5228.52 ± 2367.79	5148.50 ± 3624.94	0.869	1 (1.8%)

3.5 Sleep

3.5.1 Study 1

vars_to_plot2 <- c(
  "cpd_h",
  "sd_onset_clock_h",
  "sd_midpoint_clock_h",
  "mean_wake_clock_h",
  "social_jetlag_h",
  "sri",
  "sd_wake_clock_h",
  "mean_midpoint_weekday_h",
  "mean_onset_clock_h",
  "mean_midpoint_clock_h",
  "mean_midpoint_weekend_h"
)

# Keep only those that actually exist
vars_to_plot2 <- vars_to_plot2[vars_to_plot2 %in% names(df_test)]

short_names2 <- c(
  "cpd_h" = "Sleep Chronotype",
  "sd_onset_clock_h" = "SD Sleep Onset",
  "sd_midpoint_clock_h" = "Sleep Midpoint SD",
  "mean_wake_clock_h" = "Mean Wake Time",
  "social_jetlag_h" = "Social Jetlag",
  "sri" = "Sleep Reg. Index",
  "sd_wake_clock_h" = "SD Wake Time",
  "mean_midpoint_weekday_h" = "Mean Midpoint Wkday",
  "mean_onset_clock_h" = "Sleep Onset",
  "mean_midpoint_clock_h" = "M Sleep Midpoint",
  "mean_midpoint_weekend_h" = "M Midpoint Wknd"
)

short_names2 <- short_names2[names(short_names2) %in% vars_to_plot2]
plot_list2 <- lapply(
  vars_to_plot2,
  plot_ir_variable,     
  data = df_test,
  short_names = short_names2,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list2 <- plot_list2[!sapply(plot_list2, is.null)]

big_panel2 <- wrap_plots(plot_list2, ncol = 4)
big_panel2

num_vars2 <- vars_to_plot2[sapply(df_test[vars_to_plot2], is.numeric)]
cat_vars2 <- vars_to_plot2[sapply(df_test[vars_to_plot2], function(x) !is.numeric(x))]

num_summary2 <- lapply(num_vars2, function(var) {
  data <- df_test %>% filter(!is.na(.data[[var]]))
  IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
  NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
  Total_vals <- data[[var]]
  
  t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
  p_raw <- if (!is.null(t_res)) t_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  tibble(
    Variable = short_names2[[var]],
    IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
    Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
    Total   = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
    `p-value` = p_val
  )
}) %>% bind_rows()

cat_summary2 <- lapply(cat_vars2, function(var) {
  tbl <- table(df_test[[var]], df_test$IR_label)
  IR_total <- sum(tbl[, "IR"])
  NonIR_total <- sum(tbl[, "Non-IR"])
  total_total <- sum(tbl)
  
  chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
  p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  IR_str    <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2],   100*x[2]/IR_total))
  NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1],   100*x[1]/NonIR_total))
  Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
  
  tibble(
    Variable = short_names2[[var]],
    IR       = paste(IR_str, collapse = "; "),
    Non_IR   = paste(NonIR_str, collapse = "; "),
    Total    = paste(Total_str, collapse = "; "),
    `p-value` = p_val
  )
}) %>% bind_rows()

final_tableB2 <- bind_rows(num_summary2, cat_summary2)

final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)

description_rowB <- tibble(
  Variable = "Note:",
  IR = "IR = mean ± SD or count (%)",
  Non_IR = "Non-IR = mean ± SD or count (%)",
  Total = "Total = mean ± SD or count (%)",
  `p-value` = "t-test or χ² test"
)

final_tableB2 <- bind_rows(description_rowB, final_tableB2)

final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)

p_char2 <- final_tableB2$`p-value`
p_numeric2 <- as.numeric(gsub("<", "", p_char2))
sig_rows2 <- which(!is.na(p_numeric2) & p_numeric2 < 0.05) + 1

# Compute missing counts per variable
# ----------------------------
# Compute missing counts for the variables actually plotted
missing_summary <- sapply(vars_to_plot2, function(var) {
  n_missing <- sum(is.na(df_test[[var]]))
  pct_missing <- round(100 * n_missing / nrow(df_test), 1)
  paste0(n_missing, " (", pct_missing, "%)")
})

# Add description for the first row
missing_col <- c("Number of missing values per variable", missing_summary)

# Extend final_tableB2 to have the correct length
final_tableB2$`Missing (n, %)` <- missing_col

# Render the table
kbl(final_tableB2, "html", 
    caption = "Study 1. Sleep variables by IR Status",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>% 
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(5, italic = TRUE) %>%
  row_spec(sig_rows2, background = "#F5FBFF")

Study 1. Sleep variables by IR Status
Variable	IR	Non_IR	Total	p-value	Missing (n, %)
Note:	IR = mean ± SD or count (%)	Non-IR = mean ± SD or count (%)	Total = mean ± SD or count (%)	t-test or χ² test	Number of missing values per variable
Sleep Chronotype	1.48 ± 1.22	1.22 ± 0.98	1.34 ± 1.10	0.173	5 (3.6%)
SD Sleep Onset	1.88 ± 1.22	1.68 ± 1.25	1.77 ± 1.24	0.348	5 (3.6%)
Sleep Midpoint SD	2.01 ± 1.58	1.63 ± 1.28	1.80 ± 1.43	0.136	5 (3.6%)
Mean Wake Time	30.16 ± 2.14	30.20 ± 2.09	30.18 ± 2.11	0.893	0 (0%)
Social Jetlag	1.48 ± 1.56	1.17 ± 1.39	1.31 ± 1.47	0.244	7 (5.1%)
Sleep Reg. Index	29.21 ± 23.98	36.35 ± 24.88	33.13 ± 24.65	0.098	7 (5.1%)
SD Wake Time	3.21 ± 3.05	2.65 ± 2.72	2.90 ± 2.87	0.278	5 (3.6%)
Mean Midpoint Wkday	27.00 ± 1.56	26.83 ± 1.22	26.90 ± 1.37	0.500	7 (5.1%)
Sleep Onset	23.64 ± 1.53	23.38 ± 1.48	23.49 ± 1.50	0.317	0 (0%)
M Sleep Midpoint	26.90 ± 1.38	26.79 ± 1.25	26.84 ± 1.31	0.643	0 (0%)
M Midpoint Wknd	26.54 ± 2.26	26.58 ± 2.05	26.56 ± 2.14	0.925	7 (5.1%)

3.5.2 Study 2

# ----------------------------
# STUDY 2 
# ----------------------------
vars_to_plot2_s2 <- c(
  "cpd_h",
  "sd_onset_clock_h",
  "sd_midpoint_clock_h",
  "mean_wake_clock_h",
  "social_jetlag_h",
  "sri",
  "sd_wake_clock_h",
  "mean_midpoint_weekday_h",
  "mean_onset_clock_h",
  "mean_midpoint_clock_h",
  "mean_midpoint_weekend_h"
)

# Keep only the variables that exist
vars_to_plot2_s2 <- vars_to_plot2_s2[vars_to_plot2_s2 %in% names(df_s2)]

short_names2_s2 <- c(
  "cpd_h" = "Sleep Chronotype",
  "sd_onset_clock_h" = "SD Sleep Onset",
  "sd_midpoint_clock_h" = "Sleep Midpoint SD",
  "mean_wake_clock_h" = "Mean Wake Time",
  "social_jetlag_h" = "Social Jetlag",
  "sri" = "Sleep Reg. Index",
  "sd_wake_clock_h" = "SD WakeTime",
  "mean_midpoint_weekday_h" = "M Midpoint Weekday",
  "mean_onset_clock_h" = "Sleep Onset",
  "mean_midpoint_clock_h" = "M Sleep Midpoint",
  "mean_midpoint_weekend_h" = "M Midpoint Weekend"
)

short_names2_s2 <- short_names2_s2[names(short_names2_s2) %in% vars_to_plot2_s2]

# ----------------------------
# PLOTS FOR STUDY 2
# ----------------------------

plot_list2_s2 <- lapply(
  vars_to_plot2_s2,
  plot_ir_variable,
  data = df_s2,
  short_names = short_names2_s2,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list2_s2 <- plot_list2_s2[!sapply(plot_list2_s2, is.null)]

big_panel2_s2 <- wrap_plots(plot_list2_s2, ncol = 4)
big_panel2_s2

# ----------------------------
# SUMMARY TABLE FOR STUDY 2
# ----------------------------

num_vars2_s2 <- vars_to_plot2_s2[sapply(df_s2[vars_to_plot2_s2], is.numeric)]
cat_vars2_s2 <- vars_to_plot2_s2[sapply(df_s2[vars_to_plot2_s2], function(x) !is.numeric(x))]

num_summary2_s2 <- lapply(num_vars2_s2, function(var) {
  data <- df_s2 %>% filter(!is.na(.data[[var]]))
  IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
  NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
  Total_vals <- data[[var]]
  
  t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
  p_raw <- if (!is.null(t_res)) t_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  tibble(
    Variable = short_names2_s2[[var]],
    IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
    Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
    Total   = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
    `p-value` = p_val
  )
}) %>% bind_rows()


cat_summary2_s2 <- lapply(cat_vars2_s2, function(var) {
  tbl <- table(df_s2[[var]], df_s2$IR_label)
  IR_total <- sum(tbl[, "IR"])
  NonIR_total <- sum(tbl[, "Non-IR"])
  total_total <- sum(tbl)
  
  chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
  p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  IR_str    <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2],   100*x[2]/IR_total))
  NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1],   100*x[1]/NonIR_total))
  Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
  
  tibble(
    Variable = short_names2_s2[[var]],
    IR       = paste(IR_str, collapse = "; "),
    Non_IR   = paste(NonIR_str, collapse = "; "),
    Total    = paste(Total_str, collapse = "; "),
    `p-value` = p_val
  )
}) %>% bind_rows()


final_tableB2_s2 <- bind_rows(num_summary2_s2, cat_summary2_s2)

final_tableB2_s2$`p-value` <- sapply(final_tableB2_s2$`p-value`, add_stars)

# Add note row
description_rowB_s2 <- tibble(
  Variable = "Note:",
  IR = "IR = mean ± SD or count (%)",
  Non_IR = "Non-IR = mean ± SD or count (%)",
  Total = "Total = mean ± SD or count (%)",
  `p-value` = "t-test or χ² test"
)

final_tableB2_s2 <- bind_rows(description_rowB_s2, final_tableB2_s2)
final_tableB2_s2$`p-value` <- sapply(final_tableB2_s2$`p-value`, add_stars)

# Identify significant rows
p_char2_s2 <- final_tableB2_s2$`p-value`
p_numeric2_s2 <- as.numeric(gsub("<", "", p_char2_s2))
sig_rows2_s2 <- which(!is.na(p_numeric2_s2) & p_numeric2_s2 < 0.05) + 1

# ----------------------------
# Missingness per variable
# ----------------------------
missing_summary_s2 <- sapply(vars_to_plot2_s2, function(var) {
  n_missing <- sum(is.na(df_s2[[var]]))
  pct_missing <- round(100 * n_missing / nrow(df_s2), 1)
  paste0(n_missing, " (", pct_missing, "%)")
})

missing_col_s2 <- c("Number of missing values per variable", missing_summary_s2)
final_tableB2_s2$`Missing (n, %)` <- missing_col_s2

# ----------------------------
# Render Table for Study 2
# ----------------------------
kbl(final_tableB2_s2, "html",
    caption = "Study 2. Sleep Variables by IR Status",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>%
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(5, italic = TRUE) %>%
  row_spec(sig_rows2_s2, background = "#F5FBFF")

Study 2. Sleep Variables by IR Status
Variable	IR	Non_IR	Total	p-value	Missing (n, %)
Note:	IR = mean ± SD or count (%)	Non-IR = mean ± SD or count (%)	Total = mean ± SD or count (%)	t-test or χ² test	Number of missing values per variable
Sleep Chronotype	1.09 ± 0.57	1.08 ± 1.04	1.09 ± 0.86	0.954	5 (8.9%)
SD Sleep Onset	1.66 ± 0.99	1.40 ± 0.96	1.51 ± 0.97	0.351	5 (8.9%)
Sleep Midpoint SD	1.48 ± 0.81	1.51 ± 1.41	1.49 ± 1.18	0.936	5 (8.9%)
Mean Wake Time	30.84 ± 1.19	30.43 ± 1.36	30.62 ± 1.29	0.245	2 (3.6%)
Social Jetlag	0.77 ± 0.61	1.06 ± 1.12	0.94 ± 0.94	0.238	6 (10.7%)
Sleep Reg. Index	30.87 ± 26.56	37.15 ± 21.35	34.33 ± 23.78	0.374	7 (12.5%)
SD WakeTime	1.90 ± 1.04	2.23 ± 2.86	2.09 ± 2.25	0.565	5 (8.9%)
M Midpoint Weekday	26.99 ± 1.35	27.00 ± 0.86	26.99 ± 1.08	0.980	6 (10.7%)
Sleep Onset	23.48 ± 1.85	23.35 ± 1.20	23.41 ± 1.52	0.759	2 (3.6%)
M Sleep Midpoint	27.16 ± 1.32	26.89 ± 0.73	27.01 ± 1.04	0.368	2 (3.6%)
M Midpoint Weekend	27.38 ± 1.62	26.89 ± 1.24	27.10 ± 1.42	0.256	6 (10.7%)

3.6 Diet

3.6.1 Study 1

# -----------------------------
# Step 0: Define diet variables
# -----------------------------
diet_vars <- c(
  "diet1..How.many.times.a.week.did.you.eat.fast.foo",   # Fast food
  "diet2..How.many.servings.of.fruit.did.you.eat.eac",  # Fruit
  "diet3..How.many.servings.of.vegetables.did.you.ea",  # Vegetables
  "diet4..How.many.regular.sodas.or.glasses.of.sweet",  # Soda
  "diet5..How.many.times.a.week.did.you.eat.beans..l",  # Beans/Chicken/Fish
  "diet6..How.many.times.a.week.did.you.eat.regular.",  # Salty Snacks
  "diet7..How.many.times.a.week.did.you.eat.desserts",  # Desserts
  "diet8..How.much.margarine..butter..or.meat.fat.do",  # Butter & Meat
  "diet9..How.many.servings.of.fruit.juice.did.you.d"   # Fruit Juice
)

# -----------------------------
# Step 1: Identify unhealthy items
# -----------------------------
unhealthy_vars <- c(
  "diet1..How.many.times.a.week.did.you.eat.fast.foo",  
  "diet4..How.many.regular.sodas.or.glasses.of.sweet",  
  "diet6..How.many.times.a.week.did.you.eat.regular.",  
  "diet7..How.many.times.a.week.did.you.eat.desserts",  
  "diet8..How.much.margarine..butter..or.meat.fat.do",  
  "diet9..How.many.servings.of.fruit.juice.did.you.d"
)

healthy_vars <- setdiff(diet_vars, unhealthy_vars)

# -----------------------------
# Step 2: Reverse code unhealthy items (0,1,2 → 2,1,0)
# -----------------------------
df_test[unhealthy_vars] <- 2 - df_test[unhealthy_vars]

# -----------------------------
# Step 3: Compute total Diet Score
# -----------------------------
df_test$diet_score <- rowSums(df_test[diet_vars], na.rm = TRUE)

df_test$diet_unhealthy_score <- rowSums(df_test[unhealthy_vars], na.rm = TRUE)

# Optional: scale 0–100
max_score <- length(diet_vars) * 2
df_test$diet_score_0_100 <- df_test$diet_score / max_score * 100

# -----------------------------
# Step 4: Check results
# -----------------------------
#head(df_test[, c(diet_vars, "diet_score", "diet_score_0_100")])

vars_to_plot2 <- c(
  "diet1..How.many.times.a.week.did.you.eat.fast.foo",
  "diet3..How.many.servings.of.vegetables.did.you.ea",
  "diet2..How.many.servings.of.fruit.did.you.eat.eac",
  "diet_score_0_100",
  "diet4..How.many.regular.sodas.or.glasses.of.sweet",
  "diet5..How.many.times.a.week.did.you.eat.beans..l",
  "diet7..How.many.times.a.week.did.you.eat.desserts",
  "diet8..How.much.margarine..butter..or.meat.fat.do",
  "diet9..How.many.servings.of.fruit.juice.did.you.d",
  "diet6..How.many.times.a.week.did.you.eat.regular.",
  "dmlpor..When.reflecting.on.your.typical.eating.ha",
  "dmlfrveg..In.a.typical.day..how.many.servings.of.",
  "dmlgrain..In.a.typical.day..how.often.do.you.cons",
  "dmlsugar..When.reflecting.on.your.typical.eating.")

# Keep only those that actually exist
vars_to_plot2 <- vars_to_plot2[vars_to_plot2 %in% names(df_test)]

short_names2 <- c(
  "diet_score_0_100" = "Overall Diet Score",
  "diet1..How.many.times.a.week.did.you.eat.fast.foo" = "Weekly Fastfood",
  "diet3..How.many.servings.of.vegetables.did.you.ea" = "Weekly Vegetables",
  "diet2..How.many.servings.of.fruit.did.you.eat.eac" = "Weekly Fruit",
  "diet4..How.many.regular.sodas.or.glasses.of.sweet" = "Weekly Soda",
  "diet5..How.many.times.a.week.did.you.eat.beans..l"= "Weekly Beans",
  "diet7..How.many.times.a.week.did.you.eat.desserts" =  "Weekly Desserts",
  "diet8..How.much.margarine..butter..or.meat.fat.do" =  "Weekly Butter & Meat",
  "diet9..How.many.servings.of.fruit.juice.did.you.d" =  "Weekly Fruit juice",
  "diet6..How.many.times.a.week.did.you.eat.regular." = "Regular Salty Snacks per Week",
  "dmlpor..When.reflecting.on.your.typical.eating.ha" = "Typical Portion",
  "dmlfrveg..In.a.typical.day..how.many.servings.of." = "Fruit & Veg Intake",
  "dmlgrain..In.a.typical.day..how.often.do.you.cons" = "Typical Grain Intake",
  "dmlsugar..When.reflecting.on.your.typical.eating." = "Typical Sugar Intake")

short_names2 <- short_names2[names(short_names2) %in% vars_to_plot2]
plot_list2 <- lapply(
  vars_to_plot2,
  plot_ir_variable,     # <- same function you already created
  data = df_test,
  short_names = short_names2,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list2 <- plot_list2[!sapply(plot_list2, is.null)]

big_panel2 <- wrap_plots(plot_list2, ncol = 6)
big_panel2

num_vars2 <- vars_to_plot2[sapply(df_test[vars_to_plot2], is.numeric)]
cat_vars2 <- vars_to_plot2[sapply(df_test[vars_to_plot2], function(x) !is.numeric(x))]

num_summary2 <- lapply(num_vars2, function(var) {
  data <- df_test %>% filter(!is.na(.data[[var]]))
  IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
  NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
  Total_vals <- data[[var]]
  
  t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
  p_raw <- if (!is.null(t_res)) t_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  tibble(
    Variable = short_names2[[var]],
    IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
    Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
    Total   = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
    `p-value` = p_val
  )
}) %>% bind_rows()

cat_summary2 <- lapply(cat_vars2, function(var) {
  tbl <- table(df_test[[var]], df_test$IR_label)
  IR_total <- sum(tbl[, "IR"])
  NonIR_total <- sum(tbl[, "Non-IR"])
  total_total <- sum(tbl)
  
  chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
  p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  IR_str    <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2],   100*x[2]/IR_total))
  NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1],   100*x[1]/NonIR_total))
  Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
  
  tibble(
    Variable = short_names2[[var]],
    IR       = paste(IR_str, collapse = "; "),
    Non_IR   = paste(NonIR_str, collapse = "; "),
    Total    = paste(Total_str, collapse = "; "),
    `p-value` = p_val
  )
}) %>% bind_rows()

final_tableB2 <- bind_rows(num_summary2, cat_summary2)

final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)

description_rowB <- tibble(
  Variable = "Note:",
  IR = "IR = mean ± SD or count (%)",
  Non_IR = "Non-IR = mean ± SD or count (%)",
  Total = "Total = mean ± SD or count (%)",
  `p-value` = "t-test or χ² test"
)

final_tableB2 <- bind_rows(description_rowB, final_tableB2)

final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)

p_char2  <- gsub("[^0-9\\.eE+-]", "", final_tableB2$`p-value`)
p_numeric2 <- as.numeric(gsub("<", "-", p_char2))
sig_rows2 <- which(!is.na(p_numeric2) & p_numeric2 < 0.05) + 1

# Compute missing counts per variable
# ----------------------------
# Compute missing counts for the variables actually plotted
missing_summary <- sapply(vars_to_plot2, function(var) {
  n_missing <- sum(is.na(df_test[[var]]))
  pct_missing <- round(100 * n_missing / nrow(df_test), 1)
  paste0(n_missing, " (", pct_missing, "%)")
})

# Add description for the first row
missing_col <- c("Number of missing values per variable", missing_summary)

# Extend final_tableB2 to have the correct length
final_tableB2$`Missing (n, %)` <- missing_col

# Render the table
kbl(final_tableB2, "html", 
    caption = "Study 1. Diet variables by IR Status",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>% 
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(5, italic = TRUE) %>%
  row_spec(sig_rows2, background = "#F5FBFF")

Study 1. Diet variables by IR Status
Variable	IR	Non_IR	Total	p-value	Missing (n, %)
Note:	IR = mean ± SD or count (%)	Non-IR = mean ± SD or count (%)	Total = mean ± SD or count (%)	t-test or χ² test	Number of missing values per variable
Weekly Fastfood	1.22 ± 0.75	1.30 ± 0.67	1.27 ± 0.70	0.551	10 (7.2%)
Weekly Vegetables	1.47 ± 0.71	1.40 ± 0.65	1.43 ± 0.67	0.588	10 (7.2%)
Weekly Fruit	1.74 ± 0.52	1.57 ± 0.61	1.64 ± 0.57	0.089	12 (8.7%)
Overall Diet Score	66.04 ± 21.10	60.82 ± 22.44	63.16 ± 21.93	0.162	0 (0%)
Weekly Soda	1.62 ± 0.64	1.66 ± 0.63	1.64 ± 0.64	0.749	10 (7.2%)
Weekly Beans	0.66 ± 0.74	0.51 ± 0.65	0.58 ± 0.69	0.260	10 (7.2%)
Weekly Desserts	1.45 ± 0.65	1.21 ± 0.72	1.32 ± 0.70	0.056	10 (7.2%)
Weekly Butter & Meat	1.21 ± 0.59	1.26 ± 0.61	1.23 ± 0.60	0.635	10 (7.2%)
Weekly Fruit juice	1.83 ± 0.46	1.75 ± 0.47	1.79 ± 0.47	0.374	11 (8%)
Regular Salty Snacks per Week	1.55 ± 0.63	1.27 ± 0.70	1.40 ± 0.68	0.018 *	10 (7.2%)
Typical Portion	6.67 ± 3.01	6.64 ± 2.87	6.65 ± 2.92	0.966	2 (1.4%)
Fruit & Veg Intake	5.40 ± 3.04	5.72 ± 3.02	5.58 ± 3.02	0.538	0 (0%)
Typical Grain Intake	5.65 ± 4.20	5.53 ± 3.88	5.58 ± 4.01	0.864	0 (0%)
Typical Sugar Intake	4.44 ± 3.15	4.41 ± 3.36	4.42 ± 3.26	0.960	0 (0%)

3.6.2 Study 2

# -----------------------------
# Step 0: Define diet variables
# -----------------------------
diet_vars <- c(
  "diet1..How.many.times.a.week.did.you.eat.fast.foo",   # Fast food
  "diet2..How.many.servings.of.fruit.did.you.eat.eac",  # Fruit
  "diet3..How.many.servings.of.vegetables.did.you.ea",  # Vegetables
  "diet4..How.many.regular.sodas.or.glasses.of.sweet",  # Soda
  "diet5..How.many.times.a.week.did.you.eat.beans..l",  # Beans/Chicken/Fish
  "diet6..How.many.times.a.week.did.you.eat.regular.",  # Salty Snacks
  "diet7..How.many.times.a.week.did.you.eat.desserts",  # Desserts
  "diet8..How.much.margarine..butter..or.meat.fat.do",  # Butter & Meat
  "diet9..How.many.servings.of.fruit.juice.did.you.d")   # Fruit Juice )

# -----------------------------
# Step 1: Identify unhealthy items
# -----------------------------
unhealthy_vars <- c(
  "diet1..How.many.times.a.week.did.you.eat.fast.foo",  
  "diet4..How.many.regular.sodas.or.glasses.of.sweet",  
  "diet6..How.many.times.a.week.did.you.eat.regular.",  
  "diet7..How.many.times.a.week.did.you.eat.desserts",  
  "diet8..How.much.margarine..butter..or.meat.fat.do",  
  "diet9..How.many.servings.of.fruit.juice.did.you.d")

healthy_vars <- setdiff(diet_vars, unhealthy_vars)

# -----------------------------
# Step 2: Reverse code unhealthy items (0,1,2 → 2,1,0)
# -----------------------------
df_s2[unhealthy_vars] <- 2 - df_s2[unhealthy_vars]

# -----------------------------
# Step 3: Compute total Diet Score
# -----------------------------
df_s2$diet_score <- rowSums(df_s2[diet_vars], na.rm = TRUE)
df_s2$diet_unhealthy_score <- rowSums(df_s2[unhealthy_vars], na.rm = TRUE)

# Optional: scale 0–100
max_score <- length(diet_vars) * 2
df_s2$diet_score_0_100 <- df_s2$diet_score / max_score * 100

# -----------------------------
# Step 4: Variables to plot
# -----------------------------
vars_to_plot2 <- c(
  "diet1..How.many.times.a.week.did.you.eat.fast.foo",
  "diet3..How.many.servings.of.vegetables.did.you.ea",
  "diet2..How.many.servings.of.fruit.did.you.eat.eac",
  "diet_score_0_100",
  "diet4..How.many.regular.sodas.or.glasses.of.sweet",
  "diet5..How.many.times.a.week.did.you.eat.beans..l",
  "diet7..How.many.times.a.week.did.you.eat.desserts",
  "diet8..How.much.margarine..butter..or.meat.fat.do",
  "diet9..How.many.servings.of.fruit.juice.did.you.d",
  "diet6..How.many.times.a.week.did.you.eat.regular.",
  "dmlpor..When.reflecting.on.your.typical.eating.ha",
  "dmlfrveg..In.a.typical.day..how.many.servings.of.",
  "dmlgrain..In.a.typical.day..how.often.do.you.cons",
  "dmlsugar..When.reflecting.on.your.typical.eating."
)

# Keep only variables that exist
vars_to_plot2 <- vars_to_plot2[vars_to_plot2 %in% names(df_s2)]

short_names2 <- c(
  "diet_score_0_100" = "Overall Diet Score",
  "diet1..How.many.times.a.week.did.you.eat.fast.foo" = "Weekly Fastfood",
  "diet3..How.many.servings.of.vegetables.did.you.ea" = "Weekly Vegetables",
  "diet2..How.many.servings.of.fruit.did.you.eat.eac" = "Weekly Fruit",
  "diet4..How.many.regular.sodas.or.glasses.of.sweet" = "Weekly Soda",
  "diet5..How.many.times.a.week.did.you.eat.beans..l"= "Weekly Beans",
  "diet7..How.many.times.a.week.did.you.eat.desserts" =  "Weekly Desserts",
  "diet8..How.much.margarine..butter..or.meat.fat.do" =  "Weekly Butter & Meat",
  "diet9..How.many.servings.of.fruit.juice.did.you.d" =  "Weekly Fruit juice",
  "diet6..How.many.times.a.week.did.you.eat.regular." = "Regular Salty Snacks per Week",
  "dmlpor..When.reflecting.on.your.typical.eating.ha" = "Typical Portion",
  "dmlfrveg..In.a.typical.day..how.many.servings.of." = "Fruit & Veg Intake",
  "dmlgrain..In.a.typical.day..how.often.do.you.cons" = "Typical Grain Intake",
  "dmlsugar..When.reflecting.on.your.typical.eating." = "Typical Sugar Intake"
)

short_names2 <- short_names2[names(short_names2) %in% vars_to_plot2]

# -----------------------------
# Step 5: Generate plots
# -----------------------------
plot_list2 <- lapply(
  vars_to_plot2,
  plot_ir_variable,
  data = df_s2,
  short_names = short_names2,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list2 <- plot_list2[!sapply(plot_list2, is.null)]
big_panel2 <- wrap_plots(plot_list2, ncol = 6)
big_panel2

# -----------------------------
# Step 6: Summarize numeric vs categorical
# -----------------------------
num_vars2 <- vars_to_plot2[sapply(df_s2[vars_to_plot2], is.numeric)]
cat_vars2 <- vars_to_plot2[sapply(df_s2[vars_to_plot2], function(x) !is.numeric(x))]

# Numeric summaries
num_summary2 <- lapply(num_vars2, function(var) {
  data <- df_s2 %>% filter(!is.na(.data[[var]]))
  IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
  NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
  Total_vals <- data[[var]]
  
  t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
  p_raw <- if (!is.null(t_res)) t_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  tibble(
    Variable = short_names2[[var]],
    IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
    Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
    Total   = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
    `p-value` = p_val
  )
}) %>% bind_rows()

# Categorical summaries
cat_summary2 <- lapply(cat_vars2, function(var) {
  tbl <- table(df_s2[[var]], df_s2$IR_label)
  IR_total <- sum(tbl[, "IR"])
  NonIR_total <- sum(tbl[, "Non-IR"])
  total_total <- sum(tbl)
  
  chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
  p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  IR_str    <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2],   100*x[2]/IR_total))
  NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1],   100*x[1]/NonIR_total))
  Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
  
  tibble(
    Variable = short_names2[[var]],
    IR       = paste(IR_str, collapse = "; "),
    Non_IR   = paste(NonIR_str, collapse = "; "),
    Total    = paste(Total_str, collapse = "; "),
    `p-value` = p_val
  )
}) %>% bind_rows()

final_tableB2 <- bind_rows(num_summary2, cat_summary2)

final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)

# Add description row
description_rowB <- tibble(
  Variable = "Note:",
  IR = "IR = mean ± SD or count (%)",
  Non_IR = "Non-IR = mean ± SD or count (%)",
  Total = "Total = mean ± SD or count (%)",
  `p-value` = "t-test or χ² test"
)

final_tableB2 <- bind_rows(description_rowB, final_tableB2)
final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)

# Identify significant rows
p_char2  <- gsub("[^0-9\\.eE+-]", "", final_tableB2$`p-value`)
p_numeric2 <- as.numeric(gsub("<", "-", p_char2))
sig_rows2 <- which(!is.na(p_numeric2) & p_numeric2 < 0.05) + 1

# Compute missing counts per variable
missing_summary <- sapply(vars_to_plot2, function(var) {
  n_missing <- sum(is.na(df_s2[[var]]))
  pct_missing <- round(100 * n_missing / nrow(df_s2), 1)
  paste0(n_missing, " (", pct_missing, "%)")
})

missing_col <- c("Number of missing values per variable", missing_summary)
final_tableB2$`Missing (n, %)` <- missing_col

# Render the table
kbl(final_tableB2, "html", 
    caption = "Study 2. Diet Variables by IR Status",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>% 
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(5, italic = TRUE) %>%
  row_spec(sig_rows2, background = "#F5FBFF")

Study 2. Diet Variables by IR Status
Variable	IR	Non_IR	Total	p-value	Missing (n, %)
Note:	IR = mean ± SD or count (%)	Non-IR = mean ± SD or count (%)	Total = mean ± SD or count (%)	t-test or χ² test	Number of missing values per variable
Weekly Fastfood	1.44 ± 0.51	1.60 ± 0.58	1.52 ± 0.54	0.303	6 (10.7%)
Weekly Vegetables	1.60 ± 0.50	1.40 ± 0.58	1.50 ± 0.54	0.197	6 (10.7%)
Weekly Fruit	1.56 ± 0.58	1.75 ± 0.53	1.65 ± 0.56	0.239	7 (12.5%)
Overall Diet Score	66.67 ± 17.43	62.59 ± 29.68	64.48 ± 24.63	0.528	0 (0%)
Weekly Soda	1.68 ± 0.56	1.88 ± 0.33	1.78 ± 0.46	0.131	6 (10.7%)
Weekly Beans	0.68 ± 0.69	0.40 ± 0.58	0.54 ± 0.65	0.127	6 (10.7%)
Weekly Desserts	1.20 ± 0.76	1.52 ± 0.65	1.36 ± 0.72	0.118	6 (10.7%)
Weekly Butter & Meat	1.32 ± 0.48	1.48 ± 0.59	1.40 ± 0.53	0.295	6 (10.7%)
Weekly Fruit juice	1.52 ± 0.65	1.88 ± 0.33	1.70 ± 0.54	0.019 *	6 (10.7%)
Regular Salty Snacks per Week	1.48 ± 0.65	1.68 ± 0.48	1.58 ± 0.57	0.223	6 (10.7%)
Typical Portion	6.15 ± 2.94	7.33 ± 3.14	6.79 ± 3.08	0.153	0 (0%)
Fruit & Veg Intake	5.77 ± 2.32	5.52 ± 2.79	5.64 ± 2.56	0.716	1 (1.8%)
Typical Grain Intake	7.31 ± 3.80	7.17 ± 3.13	7.23 ± 3.43	0.881	0 (0%)
Typical Sugar Intake	3.65 ± 3.02	5.17 ± 3.07	4.46 ± 3.12	0.069	0 (0%)

3.7 HR/HRV

3.7.1 Study 1

# Keep only those that actually exist
vars_to_plot2 <- vars_to_plot2[vars_to_plot2 %in% names(df_test)]

df_test$hrv_rmssd_clean <- pmin(
  pmax(df_test$hrv_rmssd, 5),  # lower bound
  100                          # upper bound
)

num_changed_rmssd <- sum(df_test$hrv_rmssd_clean != df_test$hrv_rmssd, na.rm = TRUE)

vars_to_plot2 <- c(
  "hr_from_ecg",
  "hrv_rmssd_clean",
  "stress_overall_daily_mean_stress")
short_names2 <- c(
  "hr_from_ecg" = "Resting HR (ECG)",
  "hrv_rmssd_clean" = "HRV RMSSD (ECG)",
  "stress_overall_daily_mean_stress" = "Daily Stress score (Garmin)")

short_names2 <- short_names2[names(short_names2) %in% vars_to_plot2]
plot_list2 <- lapply(
  vars_to_plot2,
  plot_ir_variable,     # <- same function you already created
  data = df_test,
  short_names = short_names2,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list2 <- plot_list2[!sapply(plot_list2, is.null)]

big_panel2 <- wrap_plots(plot_list2, ncol = 4)
big_panel2

num_vars2 <- vars_to_plot2[sapply(df_test[vars_to_plot2], is.numeric)]
cat_vars2 <- vars_to_plot2[sapply(df_test[vars_to_plot2], function(x) !is.numeric(x))]

num_summary2 <- lapply(num_vars2, function(var) {
  data <- df_test %>% filter(!is.na(.data[[var]]))
  IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
  NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
  Total_vals <- data[[var]]
  
  t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
  p_raw <- if (!is.null(t_res)) t_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  tibble(
    Variable = short_names2[[var]],
    IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
    Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
    Total   = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
    `p-value` = p_val
  )
}) %>% bind_rows()

cat_summary2 <- lapply(cat_vars2, function(var) {
  tbl <- table(df_test[[var]], df_test$IR_label)
  IR_total <- sum(tbl[, "IR"])
  NonIR_total <- sum(tbl[, "Non-IR"])
  total_total <- sum(tbl)
  
  chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
  p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  IR_str    <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2],   100*x[2]/IR_total))
  NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1],   100*x[1]/NonIR_total))
  Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
  
  tibble(
    Variable = short_names2[[var]],
    IR       = paste(IR_str, collapse = "; "),
    Non_IR   = paste(NonIR_str, collapse = "; "),
    Total    = paste(Total_str, collapse = "; "),
    `p-value` = p_val
  )
}) %>% bind_rows()

final_tableB2 <- bind_rows(num_summary2, cat_summary2)

final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)

description_rowB <- tibble(
  Variable = "Note:",
  IR = "IR = mean ± SD or count (%)",
  Non_IR = "Non-IR = mean ± SD or count (%)",
  Total = "Total = mean ± SD or count (%)",
  `p-value` = "t-test or χ² test"
)

final_tableB2 <- bind_rows(description_rowB, final_tableB2)

final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)

p_char2  <- gsub("[^0-9\\.eE+-]", "", final_tableB2$`p-value`)
p_numeric2 <- as.numeric(gsub("<", "-", p_char2))
sig_rows2 <- which(!is.na(p_numeric2) & p_numeric2 < 0.05) + 1

# Compute missing counts per variable
# ----------------------------
# Compute missing counts for the variables actually plotted
missing_summary <- sapply(vars_to_plot2, function(var) {
  n_missing <- sum(is.na(df_test[[var]]))
  pct_missing <- round(100 * n_missing / nrow(df_test), 1)
  paste0(n_missing, " (", pct_missing, "%)")
})

# Add description for the first row
missing_col <- c("Number of missing values per variable", missing_summary)

# Extend final_tableB2 to have the correct length
final_tableB2$`Missing (n, %)` <- missing_col

# Render the table
kbl(final_tableB2, "html", 
    caption = "Study 1. HR/HRV by IR Status",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>% 
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(5, italic = TRUE) %>%
  row_spec(sig_rows2, background = "#F5FBFF")

Study 1. HR/HRV by IR Status
Variable	IR	Non_IR	Total	p-value	Missing (n, %)
Note:	IR = mean ± SD or count (%)	Non-IR = mean ± SD or count (%)	Total = mean ± SD or count (%)	t-test or χ² test	Number of missing values per variable
Resting HR (ECG)	69.03 ± 11.47	64.99 ± 11.84	66.83 ± 11.80	0.049 *	6 (4.3%)
HRV RMSSD (ECG)	32.56 ± 26.10	35.84 ± 26.35	34.35 ± 26.19	0.476	6 (4.3%)
Daily Stress score (Garmin)	27.76 ± 13.97	24.34 ± 13.48	25.88 ± 13.76	0.149	0 (0%)

3.8 Study 2

# Keep only those that actually exist
vars_to_plot2 <- vars_to_plot2[vars_to_plot2 %in% names(df_s2)]

df_s2$hrv_rmssd_clean <- pmin(
  pmax(df_s2$hrv_rmssd, 5),  # lower bound
  100                         # upper bound
)

num_changed_rmssd <- sum(df_s2$hrv_rmssd_clean != df_s2$hrv_rmssd, na.rm = TRUE)

vars_to_plot2 <- c(
  "hr_from_ecg",
  "hrv_rmssd_clean",
  "stress_overall_daily_mean_stress"
)

short_names2 <- c(
  "hr_from_ecg" = "Resting HR (ECG)",
  "hrv_rmssd_clean" = "HRV RMSSD (ECG)",
  "stress_overall_daily_mean_stress" = "Daily Stress score (Garmin)"
)

short_names2 <- short_names2[names(short_names2) %in% vars_to_plot2]
plot_list2 <- lapply(
  vars_to_plot2,
  plot_ir_variable,     # <- same function you already created
  data = df_s2,
  short_names = short_names2,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list2 <- plot_list2[!sapply(plot_list2, is.null)]
big_panel2 <- wrap_plots(plot_list2, ncol = 4)
big_panel2

num_vars2 <- vars_to_plot2[sapply(df_s2[vars_to_plot2], is.numeric)]
cat_vars2 <- vars_to_plot2[sapply(df_s2[vars_to_plot2], function(x) !is.numeric(x))]

num_summary2 <- lapply(num_vars2, function(var) {
  data <- df_s2 %>% filter(!is.na(.data[[var]]))
  IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
  NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
  Total_vals <- data[[var]]
  
  t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
  p_raw <- if (!is.null(t_res)) t_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  tibble(
    Variable = short_names2[[var]],
    IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
    Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
    Total   = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
    `p-value` = p_val
  )
}) %>% bind_rows()

cat_summary2 <- lapply(cat_vars2, function(var) {
  tbl <- table(df_s2[[var]], df_s2$IR_label)
  IR_total <- sum(tbl[, "IR"])
  NonIR_total <- sum(tbl[, "Non-IR"])
  total_total <- sum(tbl)
  
  chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
  p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  IR_str    <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2],   100*x[2]/IR_total))
  NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1],   100*x[1]/NonIR_total))
  Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
  
  tibble(
    Variable = short_names2[[var]],
    IR       = paste(IR_str, collapse = "; "),
    Non_IR   = paste(NonIR_str, collapse = "; "),
    Total    = paste(Total_str, collapse = "; "),
    `p-value` = p_val
  )
}) %>% bind_rows()

final_tableB2 <- bind_rows(num_summary2, cat_summary2)

final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)

description_rowB <- tibble(
  Variable = "Note:",
  IR = "IR = mean ± SD or count (%)",
  Non_IR = "Non-IR = mean ± SD or count (%)",
  Total = "Total = mean ± SD or count (%)",
  `p-value` = "t-test or χ² test"
)

final_tableB2 <- bind_rows(description_rowB, final_tableB2)
final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)

p_char2  <- gsub("[^0-9\\.eE+-]", "", final_tableB2$`p-value`)
p_numeric2 <- as.numeric(gsub("<", "-", p_char2))
sig_rows2 <- which(!is.na(p_numeric2) & p_numeric2 < 0.05) + 1

# Compute missing counts per variable
missing_summary <- sapply(vars_to_plot2, function(var) {
  n_missing <- sum(is.na(df_s2[[var]]))
  pct_missing <- round(100 * n_missing / nrow(df_s2), 1)
  paste0(n_missing, " (", pct_missing, "%)")
})

missing_col <- c("Number of missing values per variable", missing_summary)
final_tableB2$`Missing (n, %)` <- missing_col

# Render the table
kbl(final_tableB2, "html", 
    caption = "Study 2. HR/HRV by IR Status",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>% 
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(5, italic = TRUE) %>%
  row_spec(sig_rows2, background = "#F5FBFF")

Study 2. HR/HRV by IR Status
Variable	IR	Non_IR	Total	p-value	Missing (n, %)
Note:	IR = mean ± SD or count (%)	Non-IR = mean ± SD or count (%)	Total = mean ± SD or count (%)	t-test or χ² test	Number of missing values per variable
Resting HR (ECG)	64.38 ± 11.94	56.83 ± 6.07	60.40 ± 9.98	0.006 **	1 (1.8%)
HRV RMSSD (ECG)	33.94 ± 31.56	30.25 ± 23.12	32.00 ± 27.24	0.626	1 (1.8%)
Daily Stress score (Garmin)	19.44 ± 13.86	22.33 ± 11.69	20.96 ± 12.73	0.410	1 (1.8%)

4 Results

5 Model development

5.0.1 Features

library(tidyverse)
library(knitr)
library(kableExtra)

# -------------------------------
# 1. Define subset of features
# -------------------------------

X <- df_test %>%
  dplyr::select(
    fh_dm2sb_cat,
    years_of_education, 
    age, 
    bmi_vsorres..BMI, 
    whr_vsorres..Waist.to.Hip.Ratio..WHR.,
    pulse_vsorres..Heart.Rate..bpm.,
    sualcage, 
    sualckncf,
    sri,
    mean_pct_steps_morning,
    mean_pct_steps_evening,
    dmlhex..How.often.would.you.say.that.you.engage.i,
    dmlvex..In.a.typical.week..how.often.do.you.engag,
    hr_from_ecg,
    mean_active_minutes,
    diet6..How.many.times.a.week.did.you.eat.regular.,
    diet1..How.many.times.a.week.did.you.eat.fast.foo,
    dmlgrain..In.a.typical.day..how.often.do.you.cons,
    dmlsugar..When.reflecting.on.your.typical.eating.,
    cv_total_steps,
    stress_overall_daily_mean_stress
  ) %>%
  mutate(across(where(is.factor), ~ factor(trimws(.))))

selected_features <- colnames(X)

# -------------------------------
# 2. Feature categories
# -------------------------------

feature_categories <- tibble(
  Feature = selected_features,
  Category = c(
    "Demographics",           # fh_dm2sb_cat
    "Demographics",           # years_of_education
    "Demographics",           # age
    "Anthropometrics & Vitals", # bmi_vsorres..BMI
    "Anthropometrics & Vitals", # whr_vsorres..Waist.to.Hip.Ratio..WHR.
    "Anthropometrics & Vitals", # pulse_vsorres..Heart.Rate..bpm.
    "Alcohol / Substance Use",  # sualcage
    "Alcohol / Substance Use",  # sualckncf
    "Sleep",                    # sri
    "Physical Activity",        # mean_pct_steps_morning
    "Physical Activity",        # mean_pct_steps_evening
    "Physical Activity",        # dmlhex..How.often.would.you.say.that.you.engage.i
    "Physical Activity",        # dmlvex..In.a.typical.week..how.often.do.you.engag
    "ECG / Heart Rate & HRV",   # hr_from_ecg
    "Physical Activity",        # mean_active_minutes
    "Nutrition",                # diet6..How.many.times.a.week.did.you.eat.regular.
    "Nutrition",                # diet1..How.many.times.a.week.did.you.eat.fast.foo
    "Nutrition",                # dmlgrain..In.a.typical.day..how.often.do.you.cons
    "Nutrition",                # dmlsugar..When.reflecting.on.your.typical.eating.
    "Physical Activity",        # cv_total_steps
    "Stress"                    # stress_overall_daily_mean_stress
  )
)

# -------------------------------
# 3. Short names dictionary
# -------------------------------

short_names_all <- c(
  "bmi_vsorres..BMI" = "BMI",
  "whr_vsorres..Waist.to.Hip.Ratio..WHR." = "WaistHip_Ratio",
  "pulse_vsorres..Heart.Rate..bpm." = "PulseHR",
  "age" = "Age",
  "fh_dm2sb_cat" = "Sibling T2D history",
  "years_of_education" = "Years of education",
  "sualcage" = "Total years drinking",
  "sualckncf" = "Wine in past year",
  "sri" = "Sleep Reg. Index",
  "mean_pct_steps_morning" = "Pct Steps Morning",
  "mean_pct_steps_evening" = "Pct Steps Evening",
  "dmlhex..How.often.would.you.say.that.you.engage.i" = "Home Exercise Frequency",
  "dmlvex..In.a.typical.week..how.often.do.you.engag" = "Vig Exercise Frequency",
  "hr_from_ecg" = "Resting HR (ECG)",
  "mean_active_minutes" = "Mean Active Minutes",
  "diet6..How.many.times.a.week.did.you.eat.regular." = "Regular Salty Snacks per Week",
  "diet1..How.many.times.a.week.did.you.eat.fast.foo" = "Weekly Fastfood",
  "dmlgrain..In.a.typical.day..how.often.do.you.cons" = "Typical Grain Intake",
  "dmlsugar..When.reflecting.on.your.typical.eating." = "Typical Sugar Intake",
  "cv_total_steps" = "CV Total Steps",
  "stress_overall_daily_mean_stress" = "Daily Stress score (Garmin)"
)

short_names_tbl <- tibble(
  Feature = names(short_names_all),
  Short_Name = unname(short_names_all)
)

# -------------------------------
# 4. Combine category + name (2 columns)
# -------------------------------

final_table <- feature_categories %>%
  left_join(short_names_tbl, by = "Feature") %>%
  mutate(`Feature / Short Name` = coalesce(Short_Name, Feature)) %>%
  select(Category, `Feature / Short Name`) %>%
  arrange(Category, `Feature / Short Name`)

# -------------------------------
# 5. Render 2-column table
# -------------------------------

kable(
  final_table,
  caption = "Selected Features by Category",
  col.names = c("Category", "Feature / Short Name")
) %>%
  kable_classic(full_width = TRUE)

Selected Features by Category
Category	Feature / Short Name
Alcohol / Substance Use	Total years drinking
Alcohol / Substance Use	Wine in past year
Anthropometrics & Vitals	BMI
Anthropometrics & Vitals	PulseHR
Anthropometrics & Vitals	WaistHip_Ratio
Demographics	Age
Demographics	Sibling T2D history
Demographics	Years of education
ECG / Heart Rate & HRV	Resting HR (ECG)
Nutrition	Regular Salty Snacks per Week
Nutrition	Typical Grain Intake
Nutrition	Typical Sugar Intake
Nutrition	Weekly Fastfood
Physical Activity	CV Total Steps
Physical Activity	Home Exercise Frequency
Physical Activity	Mean Active Minutes
Physical Activity	Pct Steps Evening
Physical Activity	Pct Steps Morning
Physical Activity	Vig Exercise Frequency
Sleep	Sleep Reg. Index
Stress	Daily Stress score (Garmin)

library(ggplot2)
library(viridis)

# Create feature counts table
feature_counts <- feature_categories %>%
  group_by(Category) %>%
  summarise(Count = n()) %>%
  ungroup() %>%
  mutate(Percent = round(100 * Count / sum(Count), 1))

# Wide horizontal stacked bar plot
ggplot(feature_counts, aes(x = 1, y = Count, fill = Category)) +
  geom_bar(stat = "identity", width = 0.6, color = "white") +
  geom_text(aes(label = paste0(Count, " (", Percent, "%)")), 
            position = position_stack(vjust = 0.5), color = "black", size = 5) +
  labs(title = "Feature Counts by Category", x = NULL, y = "Number of Features") +
  coord_flip(expand = FALSE) +
  scale_fill_viridis(discrete = TRUE, option = "C") +  # modern, contrasting colors
  theme_minimal(base_size = 12) +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank(),
    plot.title = element_text(face = "bold", hjust = 0.5),
    legend.position = "right"
  ) +
  # Make plot very wide and not tall
  theme(aspect.ratio = 0.25)

5.0.2 Feature Correlations

# Select your variables
df_selected <- df_test %>%
  dplyr::select(
    years_of_education, 
    age, 
    bmi_vsorres..BMI, 
    whr_vsorres..Waist.to.Hip.Ratio..WHR.,
    pulse_vsorres..Heart.Rate..bpm.,
    sualcage, 
    sualckncf,
    sri,
    mean_pct_steps_morning,
    mean_pct_steps_afternoon,
    mean_pct_steps_evening,
    dmlhex..How.often.would.you.say.that.you.engage.i,
    dmlact..What.would.you.consider.your.typical.acti,
    dmlvex..In.a.typical.week..how.often.do.you.engag,
    hr_from_ecg,
    diet7..How.many.times.a.week.did.you.eat.desserts,
    diet6..How.many.times.a.week.did.you.eat.regular.,
    dmlpor..When.reflecting.on.your.typical.eating.ha,
    dmlfrveg..In.a.typical.day..how.many.servings.of.,
    dmlgrain..In.a.typical.day..how.often.do.you.cons,
    dmlsugar..When.reflecting.on.your.typical.eating.,
    sd_wake_clock_h,
    sd_onset_clock_h,
    cv_total_steps,
    stress_overall_daily_mean_stress
  )

feature_categories <- tidyr::tibble(
  Feature = colnames(df_selected),
  Category = c(
    "Demographics", "Demographics",  # years_of_education, age
    "Anthropometrics & Vitals", "Anthropometrics & Vitals", "Anthropometrics & Vitals", # BMI, WHR, Pulse
    "Alcohol / Substance Use", "Alcohol / Substance Use", # sualcage, sualckncf
    "Stress", # sri
    "ECG / Heart Rate & HRV", "ECG / Heart Rate & HRV", 
    "Physical Activity", "Physical Activity", "Physical Activity", "Physical Activity", "Physical Activity", "Physical Activity", "Physical Activity", # steps and activity features
    "Sleep", "Sleep", # sd_wake_clock_h, sd_onset_clock_h
    "Nutrition", "Nutrition", "Nutrition", "Nutrition", "Nutrition", "Nutrition" # diet features
  )
)

# ------------------------------
# 2. Count features per category
# ------------------------------
feature_counts <- feature_categories %>%
  group_by(Category) %>%
  summarise(Count = n(), .groups = "drop") %>%
  mutate(Percent = round(Count / sum(Count) * 100, 1))

# ------------------------------
# 3. Stacked bar plot
# ------------------------------
ggplot(feature_counts, aes(x = "", y = Count, fill = Category)) +
  geom_bar(stat = "identity", width = 0.7) +  # thicker bars
  geom_text(aes(label = paste0(Count, " (", Percent, "%)")), 
            position = position_stack(vjust = 0.5), color = "black", size = 5) +
  labs(title = "Feature Counts by Category", x = "", y = "Number of Features") +
  coord_flip(expand = FALSE) +  # remove extra padding for wider look
  scale_fill_brewer(palette = "Blues") +
  theme_classic(base_size = 10) +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank())

# Keep only numeric columns
df_numeric <- df_selected %>%
  select(where(is.numeric))

# Compute correlation matrix
cor_matrix <- cor(df_numeric, use = "pairwise.complete.obs")

# 1. Compute correlation matrix
cor_matrix <- cor(df_numeric, use = "pairwise.complete.obs")
cor_matrix[is.na(cor_matrix)] <- 0  # handle zero-variance columns

# 2. Convert to long table
cor_table <- as.data.frame(as.table(cor_matrix)) %>%
  filter(Var1 != Var2)  # remove self-correlations

# 3. Remove duplicate pairs (A,B) == (B,A)
cor_table_unique <- cor_table %>%
  rowwise() %>%
  mutate(pair = paste(sort(c(Var1, Var2)), collapse = "_")) %>%
  distinct(pair, .keep_all = TRUE) %>%
  ungroup() %>%
  dplyr::select(Var1, Var2, Freq)

# 4. Rename Freq to Correlation
cor_table_unique <- cor_table_unique %>%
  dplyr::rename(Correlation = Freq)

# 5. Sort descending by absolute correlation
cor_table_unique <- cor_table_unique %>%
  dplyr::arrange(desc(abs(Correlation)))

# 6. Print table with kable
knitr::kable(cor_table_unique, digits = 2, caption = "All Correlations (Sorted by |Correlation|)")

All Correlations (Sorted by |Correlation|)
Var1	Var2	Correlation
hr_from_ecg	pulse_vsorres..Heart.Rate..bpm.	0.75
cv_total_steps	mean_pct_steps_afternoon	-0.63
dmlsugar..When.reflecting.on.your.typical.eating.	diet7..How.many.times.a.week.did.you.eat.desserts	0.62
cv_total_steps	mean_pct_steps_evening	-0.59
sd_onset_clock_h	sd_wake_clock_h	0.51
stress_overall_daily_mean_stress	hr_from_ecg	0.51
dmlvex..In.a.typical.week..how.often.do.you.engag	dmlact..What.would.you.consider.your.typical.acti	0.50
dmlact..What.would.you.consider.your.typical.acti	dmlhex..How.often.would.you.say.that.you.engage.i	0.49
cv_total_steps	mean_pct_steps_morning	-0.49
stress_overall_daily_mean_stress	mean_pct_steps_afternoon	0.48
stress_overall_daily_mean_stress	pulse_vsorres..Heart.Rate..bpm.	0.46
stress_overall_daily_mean_stress	cv_total_steps	-0.46
mean_pct_steps_afternoon	mean_pct_steps_morning	0.41
dmlfrveg..In.a.typical.day..how.many.servings.of.	dmlhex..How.often.would.you.say.that.you.engage.i	0.40
diet6..How.many.times.a.week.did.you.eat.regular.	diet7..How.many.times.a.week.did.you.eat.desserts	0.38
diet6..How.many.times.a.week.did.you.eat.regular.	dmlvex..In.a.typical.week..how.often.do.you.engag	0.35
dmlact..What.would.you.consider.your.typical.acti	bmi_vsorres..BMI	-0.34
dmlvex..In.a.typical.week..how.often.do.you.engag	dmlhex..How.often.would.you.say.that.you.engage.i	0.34
dmlhex..How.often.would.you.say.that.you.engage.i	bmi_vsorres..BMI	-0.33
dmlfrveg..In.a.typical.day..how.many.servings.of.	dmlvex..In.a.typical.week..how.often.do.you.engag	0.31
sd_wake_clock_h	mean_pct_steps_morning	-0.30
stress_overall_daily_mean_stress	mean_pct_steps_evening	0.29
stress_overall_daily_mean_stress	mean_pct_steps_morning	0.29
dmlfrveg..In.a.typical.day..how.many.servings.of.	dmlact..What.would.you.consider.your.typical.acti	0.29
dmlgrain..In.a.typical.day..how.often.do.you.cons	years_of_education	0.28
dmlgrain..In.a.typical.day..how.often.do.you.cons	dmlpor..When.reflecting.on.your.typical.eating.ha	0.27
dmlgrain..In.a.typical.day..how.often.do.you.cons	diet6..How.many.times.a.week.did.you.eat.regular.	0.27
dmlsugar..When.reflecting.on.your.typical.eating.	diet6..How.many.times.a.week.did.you.eat.regular.	0.27
sd_wake_clock_h	mean_pct_steps_evening	0.26
dmlvex..In.a.typical.week..how.often.do.you.engag	bmi_vsorres..BMI	-0.26
dmlgrain..In.a.typical.day..how.often.do.you.cons	dmlfrveg..In.a.typical.day..how.many.servings.of.	0.26
sd_wake_clock_h	hr_from_ecg	0.26
dmlsugar..When.reflecting.on.your.typical.eating.	dmlvex..In.a.typical.week..how.often.do.you.engag	0.26
mean_pct_steps_evening	mean_pct_steps_afternoon	0.25
mean_pct_steps_evening	age	-0.25
bmi_vsorres..BMI	age	-0.25
dmlfrveg..In.a.typical.day..how.many.servings.of.	dmlpor..When.reflecting.on.your.typical.eating.ha	0.24
stress_overall_daily_mean_stress	sd_onset_clock_h	0.24
mean_pct_steps_afternoon	bmi_vsorres..BMI	0.24
stress_overall_daily_mean_stress	bmi_vsorres..BMI	0.24
mean_pct_steps_morning	sualcage	0.24
sd_wake_clock_h	years_of_education	-0.23
dmlsugar..When.reflecting.on.your.typical.eating.	dmlpor..When.reflecting.on.your.typical.eating.ha	0.23
sd_onset_clock_h	mean_pct_steps_evening	0.23
stress_overall_daily_mean_stress	dmlact..What.would.you.consider.your.typical.acti	-0.23
sd_onset_clock_h	pulse_vsorres..Heart.Rate..bpm.	0.23
hr_from_ecg	bmi_vsorres..BMI	0.22
diet6..How.many.times.a.week.did.you.eat.regular.	dmlhex..How.often.would.you.say.that.you.engage.i	0.22
pulse_vsorres..Heart.Rate..bpm.	bmi_vsorres..BMI	0.22
stress_overall_daily_mean_stress	dmlvex..In.a.typical.week..how.often.do.you.engag	-0.22
sd_onset_clock_h	hr_from_ecg	0.22
dmlfrveg..In.a.typical.day..how.many.servings.of.	years_of_education	0.22
hr_from_ecg	sualckncf	0.22
bmi_vsorres..BMI	years_of_education	-0.22
mean_pct_steps_morning	age	0.21
hr_from_ecg	age	-0.21
dmlsugar..When.reflecting.on.your.typical.eating.	dmlgrain..In.a.typical.day..how.often.do.you.cons	0.21
hr_from_ecg	dmlact..What.would.you.consider.your.typical.acti	-0.20
dmlgrain..In.a.typical.day..how.often.do.you.cons	dmlhex..How.often.would.you.say.that.you.engage.i	0.20
sri	age	-0.20
sd_onset_clock_h	sualcage	0.20
hr_from_ecg	dmlhex..How.often.would.you.say.that.you.engage.i	-0.20
hr_from_ecg	dmlvex..In.a.typical.week..how.often.do.you.engag	-0.19
pulse_vsorres..Heart.Rate..bpm.	age	-0.19
dmlhex..How.often.would.you.say.that.you.engage.i	pulse_vsorres..Heart.Rate..bpm.	-0.19
dmlfrveg..In.a.typical.day..how.many.servings.of.	bmi_vsorres..BMI	-0.19
stress_overall_daily_mean_stress	sualcage	0.18
dmlact..What.would.you.consider.your.typical.acti	sualckncf	-0.18
dmlgrain..In.a.typical.day..how.often.do.you.cons	dmlact..What.would.you.consider.your.typical.acti	0.18
cv_total_steps	age	0.18
dmlvex..In.a.typical.week..how.often.do.you.engag	sri	0.18
dmlgrain..In.a.typical.day..how.often.do.you.cons	diet7..How.many.times.a.week.did.you.eat.desserts	0.18
stress_overall_daily_mean_stress	sualckncf	0.18
dmlact..What.would.you.consider.your.typical.acti	years_of_education	0.17
diet6..How.many.times.a.week.did.you.eat.regular.	dmlact..What.would.you.consider.your.typical.acti	0.17
dmlsugar..When.reflecting.on.your.typical.eating.	sri	0.17
stress_overall_daily_mean_stress	dmlhex..How.often.would.you.say.that.you.engage.i	-0.17
dmlhex..How.often.would.you.say.that.you.engage.i	sualckncf	-0.17
stress_overall_daily_mean_stress	years_of_education	-0.17
cv_total_steps	sd_onset_clock_h	-0.17
age	years_of_education	0.17
dmlhex..How.often.would.you.say.that.you.engage.i	whr_vsorres..Waist.to.Hip.Ratio..WHR.	-0.16
stress_overall_daily_mean_stress	dmlfrveg..In.a.typical.day..how.many.servings.of.	-0.16
sri	years_of_education	0.16
dmlpor..When.reflecting.on.your.typical.eating.ha	dmlhex..How.often.would.you.say.that.you.engage.i	0.16
cv_total_steps	sualcage	-0.16
dmlfrveg..In.a.typical.day..how.many.servings.of.	hr_from_ecg	-0.16
dmlfrveg..In.a.typical.day..how.many.servings.of.	sualckncf	-0.16
diet7..How.many.times.a.week.did.you.eat.desserts	sri	0.16
dmlfrveg..In.a.typical.day..how.many.servings.of.	whr_vsorres..Waist.to.Hip.Ratio..WHR.	-0.15
dmlvex..In.a.typical.week..how.often.do.you.engag	years_of_education	0.15
diet7..How.many.times.a.week.did.you.eat.desserts	dmlvex..In.a.typical.week..how.often.do.you.engag	0.15
sd_onset_clock_h	dmlsugar..When.reflecting.on.your.typical.eating.	-0.15
sd_onset_clock_h	years_of_education	-0.15
sd_onset_clock_h	dmlact..What.would.you.consider.your.typical.acti	-0.15
sd_onset_clock_h	age	-0.15
dmlfrveg..In.a.typical.day..how.many.servings.of.	pulse_vsorres..Heart.Rate..bpm.	-0.15
sd_onset_clock_h	bmi_vsorres..BMI	0.15
dmlgrain..In.a.typical.day..how.often.do.you.cons	dmlvex..In.a.typical.week..how.often.do.you.engag	0.15
diet7..How.many.times.a.week.did.you.eat.desserts	mean_pct_steps_afternoon	0.15
dmlvex..In.a.typical.week..how.often.do.you.engag	pulse_vsorres..Heart.Rate..bpm.	-0.15
dmlvex..In.a.typical.week..how.often.do.you.engag	sualcage	0.15
diet7..How.many.times.a.week.did.you.eat.desserts	years_of_education	0.14
dmlsugar..When.reflecting.on.your.typical.eating.	dmlact..What.would.you.consider.your.typical.acti	0.14
stress_overall_daily_mean_stress	sd_wake_clock_h	0.14
dmlvex..In.a.typical.week..how.often.do.you.engag	mean_pct_steps_afternoon	-0.14
sd_wake_clock_h	bmi_vsorres..BMI	0.14
sualcage	whr_vsorres..Waist.to.Hip.Ratio..WHR.	-0.14
hr_from_ecg	years_of_education	-0.14
mean_pct_steps_evening	years_of_education	-0.14
dmlsugar..When.reflecting.on.your.typical.eating.	pulse_vsorres..Heart.Rate..bpm.	-0.14
dmlfrveg..In.a.typical.day..how.many.servings.of.	sri	0.13
sd_wake_clock_h	diet7..How.many.times.a.week.did.you.eat.desserts	0.13
diet6..How.many.times.a.week.did.you.eat.regular.	mean_pct_steps_afternoon	0.13
hr_from_ecg	whr_vsorres..Waist.to.Hip.Ratio..WHR.	0.13
pulse_vsorres..Heart.Rate..bpm.	years_of_education	-0.13
dmlhex..How.often.would.you.say.that.you.engage.i	years_of_education	0.13
dmlgrain..In.a.typical.day..how.often.do.you.cons	pulse_vsorres..Heart.Rate..bpm.	-0.13
mean_pct_steps_afternoon	pulse_vsorres..Heart.Rate..bpm.	0.13
sd_onset_clock_h	dmlhex..How.often.would.you.say.that.you.engage.i	-0.13
dmlvex..In.a.typical.week..how.often.do.you.engag	sualckncf	-0.13
dmlact..What.would.you.consider.your.typical.acti	pulse_vsorres..Heart.Rate..bpm.	-0.13
dmlsugar..When.reflecting.on.your.typical.eating.	dmlfrveg..In.a.typical.day..how.many.servings.of.	0.13
dmlfrveg..In.a.typical.day..how.many.servings.of.	sualcage	0.13
sd_wake_clock_h	pulse_vsorres..Heart.Rate..bpm.	0.13
dmlpor..When.reflecting.on.your.typical.eating.ha	whr_vsorres..Waist.to.Hip.Ratio..WHR.	-0.12
dmlact..What.would.you.consider.your.typical.acti	whr_vsorres..Waist.to.Hip.Ratio..WHR.	-0.12
dmlsugar..When.reflecting.on.your.typical.eating.	years_of_education	0.12
dmlpor..When.reflecting.on.your.typical.eating.ha	dmlact..What.would.you.consider.your.typical.acti	0.12
dmlhex..How.often.would.you.say.that.you.engage.i	age	0.12
dmlact..What.would.you.consider.your.typical.acti	age	0.12
sd_onset_clock_h	dmlgrain..In.a.typical.day..how.often.do.you.cons	-0.11
dmlfrveg..In.a.typical.day..how.many.servings.of.	diet6..How.many.times.a.week.did.you.eat.regular.	0.11
dmlsugar..When.reflecting.on.your.typical.eating.	hr_from_ecg	-0.11
stress_overall_daily_mean_stress	sri	-0.11
diet7..How.many.times.a.week.did.you.eat.desserts	dmlhex..How.often.would.you.say.that.you.engage.i	0.11
stress_overall_daily_mean_stress	dmlsugar..When.reflecting.on.your.typical.eating.	-0.11
sd_wake_clock_h	age	-0.11
mean_pct_steps_morning	years_of_education	0.11
dmlgrain..In.a.typical.day..how.often.do.you.cons	bmi_vsorres..BMI	-0.11
whr_vsorres..Waist.to.Hip.Ratio..WHR.	age	0.11
dmlgrain..In.a.typical.day..how.often.do.you.cons	mean_pct_steps_morning	0.11
hr_from_ecg	mean_pct_steps_morning	-0.10
dmlpor..When.reflecting.on.your.typical.eating.ha	mean_pct_steps_morning	-0.10
sd_onset_clock_h	sualckncf	-0.10
cv_total_steps	dmlsugar..When.reflecting.on.your.typical.eating.	0.10
diet7..How.many.times.a.week.did.you.eat.desserts	hr_from_ecg	-0.10
dmlfrveg..In.a.typical.day..how.many.servings.of.	mean_pct_steps_afternoon	-0.10
dmlfrveg..In.a.typical.day..how.many.servings.of.	diet7..How.many.times.a.week.did.you.eat.desserts	0.10
cv_total_steps	dmlact..What.would.you.consider.your.typical.acti	0.10
cv_total_steps	bmi_vsorres..BMI	-0.10
mean_pct_steps_morning	bmi_vsorres..BMI	0.10
hr_from_ecg	sri	-0.10
hr_from_ecg	mean_pct_steps_evening	0.10
dmlpor..When.reflecting.on.your.typical.eating.ha	pulse_vsorres..Heart.Rate..bpm.	-0.10
stress_overall_daily_mean_stress	age	-0.10
hr_from_ecg	mean_pct_steps_afternoon	0.10
sd_wake_clock_h	dmlsugar..When.reflecting.on.your.typical.eating.	-0.10
mean_pct_steps_afternoon	sri	-0.10
dmlpor..When.reflecting.on.your.typical.eating.ha	dmlvex..In.a.typical.week..how.often.do.you.engag	0.10
dmlact..What.would.you.consider.your.typical.acti	sri	0.09
cv_total_steps	whr_vsorres..Waist.to.Hip.Ratio..WHR.	0.09
cv_total_steps	diet7..How.many.times.a.week.did.you.eat.desserts	-0.09
stress_overall_daily_mean_stress	diet7..How.many.times.a.week.did.you.eat.desserts	0.09
sri	sualcage	0.09
sd_wake_clock_h	dmlvex..In.a.typical.week..how.often.do.you.engag	-0.09
sd_wake_clock_h	dmlact..What.would.you.consider.your.typical.acti	-0.09
cv_total_steps	sri	-0.09
sd_wake_clock_h	diet6..How.many.times.a.week.did.you.eat.regular.	0.09
dmlvex..In.a.typical.week..how.often.do.you.engag	age	0.09
dmlsugar..When.reflecting.on.your.typical.eating.	mean_pct_steps_evening	-0.09
sualckncf	whr_vsorres..Waist.to.Hip.Ratio..WHR.	0.09
sualckncf	bmi_vsorres..BMI	0.09
sd_wake_clock_h	mean_pct_steps_afternoon	0.09
diet6..How.many.times.a.week.did.you.eat.regular.	years_of_education	0.08
diet7..How.many.times.a.week.did.you.eat.desserts	pulse_vsorres..Heart.Rate..bpm.	-0.08
mean_pct_steps_evening	pulse_vsorres..Heart.Rate..bpm.	0.08
diet6..How.many.times.a.week.did.you.eat.regular.	hr_from_ecg	-0.08
dmlpor..When.reflecting.on.your.typical.eating.ha	diet7..How.many.times.a.week.did.you.eat.desserts	0.08
dmlact..What.would.you.consider.your.typical.acti	mean_pct_steps_afternoon	-0.08
mean_pct_steps_afternoon	years_of_education	-0.08
diet6..How.many.times.a.week.did.you.eat.regular.	sualcage	-0.08
cv_total_steps	dmlhex..How.often.would.you.say.that.you.engage.i	-0.08
whr_vsorres..Waist.to.Hip.Ratio..WHR.	years_of_education	-0.08
mean_pct_steps_evening	sualcage	0.08
sd_onset_clock_h	dmlfrveg..In.a.typical.day..how.many.servings.of.	0.08
diet6..How.many.times.a.week.did.you.eat.regular.	mean_pct_steps_evening	0.08
mean_pct_steps_evening	whr_vsorres..Waist.to.Hip.Ratio..WHR.	-0.07
dmlhex..How.often.would.you.say.that.you.engage.i	sri	0.07
stress_overall_daily_mean_stress	dmlgrain..In.a.typical.day..how.often.do.you.cons	-0.07
cv_total_steps	pulse_vsorres..Heart.Rate..bpm.	-0.07
dmlhex..How.often.would.you.say.that.you.engage.i	mean_pct_steps_evening	0.07
sri	whr_vsorres..Waist.to.Hip.Ratio..WHR.	-0.07
dmlpor..When.reflecting.on.your.typical.eating.ha	sualcage	0.07
diet6..How.many.times.a.week.did.you.eat.regular.	sri	-0.07
dmlsugar..When.reflecting.on.your.typical.eating.	dmlhex..How.often.would.you.say.that.you.engage.i	0.07
dmlsugar..When.reflecting.on.your.typical.eating.	bmi_vsorres..BMI	-0.07
sualcage	pulse_vsorres..Heart.Rate..bpm.	-0.07
mean_pct_steps_evening	sualckncf	-0.07
cv_total_steps	dmlgrain..In.a.typical.day..how.often.do.you.cons	0.07
sd_onset_clock_h	mean_pct_steps_morning	-0.07
dmlpor..When.reflecting.on.your.typical.eating.ha	mean_pct_steps_evening	-0.06
dmlsugar..When.reflecting.on.your.typical.eating.	mean_pct_steps_morning	-0.06
dmlpor..When.reflecting.on.your.typical.eating.ha	diet6..How.many.times.a.week.did.you.eat.regular.	0.06
diet6..How.many.times.a.week.did.you.eat.regular.	mean_pct_steps_morning	0.06
dmlact..What.would.you.consider.your.typical.acti	sualcage	0.06
cv_total_steps	dmlpor..When.reflecting.on.your.typical.eating.ha	0.06
dmlpor..When.reflecting.on.your.typical.eating.ha	age	0.06
dmlvex..In.a.typical.week..how.often.do.you.engag	whr_vsorres..Waist.to.Hip.Ratio..WHR.	-0.06
dmlfrveg..In.a.typical.day..how.many.servings.of.	mean_pct_steps_morning	-0.06
sualckncf	pulse_vsorres..Heart.Rate..bpm.	0.06
sri	bmi_vsorres..BMI	-0.06
dmlgrain..In.a.typical.day..how.often.do.you.cons	sualckncf	0.06
sualcage	bmi_vsorres..BMI	0.06
dmlgrain..In.a.typical.day..how.often.do.you.cons	age	0.06
sd_onset_clock_h	dmlpor..When.reflecting.on.your.typical.eating.ha	0.06
dmlgrain..In.a.typical.day..how.often.do.you.cons	mean_pct_steps_evening	-0.06
sri	pulse_vsorres..Heart.Rate..bpm.	-0.06
pulse_vsorres..Heart.Rate..bpm.	whr_vsorres..Waist.to.Hip.Ratio..WHR.	0.06
mean_pct_steps_morning	sri	-0.06
sd_wake_clock_h	sualckncf	-0.06
sd_wake_clock_h	sri	-0.06
dmlsugar..When.reflecting.on.your.typical.eating.	mean_pct_steps_afternoon	-0.06
sd_wake_clock_h	dmlgrain..In.a.typical.day..how.often.do.you.cons	-0.05
sd_onset_clock_h	dmlvex..In.a.typical.week..how.often.do.you.engag	-0.05
mean_pct_steps_afternoon	age	-0.05
dmlgrain..In.a.typical.day..how.often.do.you.cons	sualcage	0.05
sualcage	years_of_education	0.05
dmlvex..In.a.typical.week..how.often.do.you.engag	mean_pct_steps_evening	0.05
cv_total_steps	dmlvex..In.a.typical.week..how.often.do.you.engag	0.05
stress_overall_daily_mean_stress	whr_vsorres..Waist.to.Hip.Ratio..WHR.	0.05
dmlpor..When.reflecting.on.your.typical.eating.ha	sualckncf	0.05
dmlgrain..In.a.typical.day..how.often.do.you.cons	sri	-0.04
cv_total_steps	diet6..How.many.times.a.week.did.you.eat.regular.	-0.04
sd_wake_clock_h	sualcage	-0.04
diet7..How.many.times.a.week.did.you.eat.desserts	bmi_vsorres..BMI	0.04
dmlpor..When.reflecting.on.your.typical.eating.ha	sri	0.04
diet7..How.many.times.a.week.did.you.eat.desserts	age	0.04
cv_total_steps	sualckncf	0.04
diet6..How.many.times.a.week.did.you.eat.regular.	pulse_vsorres..Heart.Rate..bpm.	-0.04
dmlpor..When.reflecting.on.your.typical.eating.ha	bmi_vsorres..BMI	-0.04
dmlfrveg..In.a.typical.day..how.many.servings.of.	mean_pct_steps_evening	0.04
sd_onset_clock_h	mean_pct_steps_afternoon	0.04
dmlact..What.would.you.consider.your.typical.acti	mean_pct_steps_evening	-0.04
sualckncf	years_of_education	-0.04
diet7..How.many.times.a.week.did.you.eat.desserts	sualckncf	-0.04
diet6..How.many.times.a.week.did.you.eat.regular.	age	-0.04
mean_pct_steps_morning	pulse_vsorres..Heart.Rate..bpm.	-0.04
sd_onset_clock_h	diet6..How.many.times.a.week.did.you.eat.regular.	-0.04
diet7..How.many.times.a.week.did.you.eat.desserts	sualcage	0.04
whr_vsorres..Waist.to.Hip.Ratio..WHR.	bmi_vsorres..BMI	0.03
mean_pct_steps_afternoon	whr_vsorres..Waist.to.Hip.Ratio..WHR.	-0.03
mean_pct_steps_evening	mean_pct_steps_morning	-0.03
stress_overall_daily_mean_stress	diet6..How.many.times.a.week.did.you.eat.regular.	0.03
sd_wake_clock_h	dmlfrveg..In.a.typical.day..how.many.servings.of.	0.03
diet6..How.many.times.a.week.did.you.eat.regular.	sualckncf	-0.03
stress_overall_daily_mean_stress	dmlpor..When.reflecting.on.your.typical.eating.ha	-0.03
diet7..How.many.times.a.week.did.you.eat.desserts	mean_pct_steps_morning	0.03
dmlact..What.would.you.consider.your.typical.acti	mean_pct_steps_morning	0.03
cv_total_steps	hr_from_ecg	-0.03
dmlgrain..In.a.typical.day..how.often.do.you.cons	mean_pct_steps_afternoon	-0.03
sd_wake_clock_h	dmlhex..How.often.would.you.say.that.you.engage.i	0.03
dmlfrveg..In.a.typical.day..how.many.servings.of.	age	-0.02
diet7..How.many.times.a.week.did.you.eat.desserts	whr_vsorres..Waist.to.Hip.Ratio..WHR.	0.02
sd_onset_clock_h	whr_vsorres..Waist.to.Hip.Ratio..WHR.	-0.02
diet6..How.many.times.a.week.did.you.eat.regular.	whr_vsorres..Waist.to.Hip.Ratio..WHR.	-0.02
sd_wake_clock_h	whr_vsorres..Waist.to.Hip.Ratio..WHR.	-0.02
sri	sualckncf	0.02
diet7..How.many.times.a.week.did.you.eat.desserts	dmlact..What.would.you.consider.your.typical.acti	0.02
sd_wake_clock_h	dmlpor..When.reflecting.on.your.typical.eating.ha	-0.02
cv_total_steps	sd_wake_clock_h	0.02
mean_pct_steps_morning	whr_vsorres..Waist.to.Hip.Ratio..WHR.	-0.02
dmlsugar..When.reflecting.on.your.typical.eating.	age	-0.02
dmlpor..When.reflecting.on.your.typical.eating.ha	hr_from_ecg	-0.02
dmlgrain..In.a.typical.day..how.often.do.you.cons	hr_from_ecg	-0.02
dmlvex..In.a.typical.week..how.often.do.you.engag	mean_pct_steps_morning	-0.02
dmlsugar..When.reflecting.on.your.typical.eating.	sualcage	-0.02
diet6..How.many.times.a.week.did.you.eat.regular.	bmi_vsorres..BMI	-0.02
diet7..How.many.times.a.week.did.you.eat.desserts	mean_pct_steps_evening	0.01
dmlsugar..When.reflecting.on.your.typical.eating.	sualckncf	-0.01
dmlpor..When.reflecting.on.your.typical.eating.ha	mean_pct_steps_afternoon	0.01
dmlhex..How.often.would.you.say.that.you.engage.i	mean_pct_steps_morning	0.01
dmlpor..When.reflecting.on.your.typical.eating.ha	years_of_education	-0.01
dmlgrain..In.a.typical.day..how.often.do.you.cons	whr_vsorres..Waist.to.Hip.Ratio..WHR.	0.01
sd_onset_clock_h	diet7..How.many.times.a.week.did.you.eat.desserts	0.01
mean_pct_steps_evening	sri	0.01
cv_total_steps	dmlfrveg..In.a.typical.day..how.many.servings.of.	0.01
sualcage	age	-0.01
cv_total_steps	years_of_education	0.01
dmlhex..How.often.would.you.say.that.you.engage.i	mean_pct_steps_afternoon	0.01
mean_pct_steps_morning	sualckncf	-0.01
sd_onset_clock_h	sri	0.00
mean_pct_steps_evening	bmi_vsorres..BMI	0.00
mean_pct_steps_afternoon	sualckncf	0.00
hr_from_ecg	sualcage	0.00
dmlhex..How.often.would.you.say.that.you.engage.i	sualcage	0.00
sualckncf	age	0.00
mean_pct_steps_afternoon	sualcage	0.00
dmlsugar..When.reflecting.on.your.typical.eating.	whr_vsorres..Waist.to.Hip.Ratio..WHR.	0.00
sualckncf	sualcage	0.00

# 7. Summary description
# Example summary_stats dataframe
summary_stats <- cor_table_unique %>%
  dplyr::summarize(
    Mean_Correlation = mean(Correlation, na.rm = TRUE),
    Median_Correlation = median(Correlation, na.rm = TRUE),
    Max_Correlation = max(Correlation, na.rm = TRUE),
    Min_Correlation = min(Correlation, na.rm = TRUE)
  )

# Display as horizontal table
kable(summary_stats, "html", caption = "Summary of Correlations") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = F, font_size = 14) %>%
  row_spec(0, bold = TRUE, background = "#A5D8FF")

Summary of Correlations
Mean_Correlation	Median_Correlation	Max_Correlation	Min_Correlation
0.0260519	0.0115898	0.7517405	-0.6285951

#hr_from_ecg pulse_vsorres..Heart.Rate..bpm.       0.752           0.752

5.0.3 Random Forest

X <- df_test %>%
  dplyr::select(
     fh_dm2sb_cat,
     years_of_education, 
     age, 
     bmi_vsorres..BMI, 
     whr_vsorres..Waist.to.Hip.Ratio..WHR.,
     pulse_vsorres..Heart.Rate..bpm.,
     sualcage, 
     sualckncf,
     #hrv_rmssd,
     sri,
     mean_pct_steps_morning,
     mean_pct_steps_evening,
     dmlhex..How.often.would.you.say.that.you.engage.i,
     dmlvex..In.a.typical.week..how.often.do.you.engag,
     hr_from_ecg,
     mean_active_minutes,
     diet6..How.many.times.a.week.did.you.eat.regular.,
     diet1..How.many.times.a.week.did.you.eat.fast.foo,
     dmlgrain..In.a.typical.day..how.often.do.you.cons,
     dmlsugar..When.reflecting.on.your.typical.eating.,
    cv_total_steps,
    stress_overall_daily_mean_stress) %>%
  mutate(across(where(is.factor), ~ factor(trimws(.))))

# ------------------------------
# 2. Define target variable
# ------------------------------
# 1. Clean target
X <- X %>% mutate(across(where(is.numeric), as.double))

y <- factor(trimws(df_test$IR_label))
levels(y) <- make.names(levels(y))  # ensures valid R names

# 2. Combine with predictors
df_model <- X
df_model$y <- y
#df_model <- na.omit(df_model)

X_clean <- df_model %>% select(-y)
y_clean <- df_model$y

# 3. Train/test split
set.seed(123)
train_index <- createDataPartition(y_clean, p = 0.8, list = FALSE)
X_train <- X_clean[train_index, ]
X_test  <- X_clean[-train_index, ]
y_train <- y_clean[train_index]
y_test  <- y_clean[-train_index]

# 4. Train RF
set.seed(123)
train_control <- trainControl(
  method = "cv",
  number = 10,
  sampling = "up" ,
  savePredictions = "final",
  classProbs = TRUE
)

set.seed(123)
model_rf <- train(
  x = X_train,
  y = y_train,
  method = "rf",
  trControl = train_control,
  preProcess = c("medianImpute"),
  tuneLength = 5
)

# 5. Evaluate
pred_class <- predict(model_rf, X_test)
cm <- caret::confusionMatrix(pred_class, y_test)

accuracy <- as.numeric(cm$overall["Accuracy"])
sensitivity <- as.numeric(cm$byClass["Sensitivity"])
specificity <- as.numeric(cm$byClass["Specificity"])
precision <- as.numeric(cm$byClass["Pos Pred Value"])
f1_score <- 2 * (precision * sensitivity) / (precision + sensitivity)

# Compute Balanced Accuracy
balanced_accuracy <- (sensitivity + specificity) / 2

# Create metrics dataframe including Balanced Accuracy
metrics <- data.frame(
  Accuracy = accuracy,
  Sensitivity = sensitivity,
  Specificity = specificity,
  Balanced_Accuracy = balanced_accuracy,
  Precision = precision,
  F1_Score = f1_score
)

# Example metrics dataframe
metrics <- data.frame(
  Metric = c("Accuracy", "Sensitivity", "Specificity", "Balanced Accuracy", "Precision", "F1 Score"),
  Value = c(accuracy, sensitivity, specificity, balanced_accuracy, precision, f1_score)
)

# Create a nice HTML table
kable(metrics, "html", caption = "Model Performance Metrics") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = T, font_size = 14) %>%
  row_spec(0, bold = TRUE, background = "#A5D8FF") %>%
  column_spec(2, bold = TRUE)

Model Performance Metrics
Metric	Value
Accuracy	0.7407407
Sensitivity	0.8333333
Specificity	0.6666667
Balanced Accuracy	0.7500000
Precision	0.6666667
F1 Score	0.7407407

# 2. ROC Curve & AUC
# ------------------------------
pred_probs <- predict(model_rf, X_test, type = "prob")[, "IR"]
roc_obj <- roc(y_test, pred_probs, levels = rev(levels(y_test)))

# ROC plot using ggplot2
roc_df <- data.frame(
  FPR = 1 - roc_obj$specificities,
  TPR = roc_obj$sensitivities
)

roc_plot = ggplot(roc_df, aes(x = FPR, y = TPR)) +
  geom_line(color = "#2C7BB6", size = 1.2) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "grey50") +
  annotate("text", x = 0.6, y = 0.1, label = paste0("AUC = ", round(auc(roc_obj), 3)), size = 5) +
  labs(title = "ROC Curve", x = "False Positive Rate (1 - Specificity)", y = "True Positive Rate (Sensitivity)") +
  theme_classic(base_size = 14) +
  theme(aspect.ratio=0.9)


# ------------------------------
# 3. Precision-Recall Curve & PRAUC
# ------------------------------
# Convert y_test to binary numeric: 1 = IR, 0 = Non-IR
y_bin <- ifelse(y_test == "IR", 1, 0)

pr_obj <- pr.curve(scores.class0 = pred_probs, weights.class0 = y_bin, curve = TRUE)

pr_df <- data.frame(
  Recall = pr_obj$curve[,1],
  Precision = pr_obj$curve[,2]
)

pr_plot = ggplot(pr_df, aes(x = Recall, y = Precision)) +
  geom_line(color = "#D7191C", size = 1.2) +
  annotate("text", x = 0.6, y = 0.1, label = paste0("PRAUC = ", round(pr_obj$auc.integral, 3)), size = 5) +
  labs(title = "Precision-Recall Curve", x = "Recall (Sensitivity)", y = "Precision") +
  theme_classic(base_size = 14) +
  theme(aspect.ratio=0.9)


combined_plot <- roc_plot + pr_plot + plot_layout(ncol = 2)
combined_plot

# shap <- Shapley$new(predictor, x.interest = X_train[1, ]) # example: first row
# # If you want all rows, you can loop or sample (expensive for large datasets)
# 
# # Extract SHAP values for the first observation
# shap_values <- shap$results
# print(shap_values)

# ------------------------------
# 4. Summary plot (global feature importance via SHAP)
# ------------------------------
# shapley_summary <- FeatureImp$new(predictor, loss = "ce")  # cross-entropy loss for classification
# plot(shapley_summary)  # shows global feature importance

n_runs <- 50
all_var_imp <- matrix(NA, nrow = ncol(X_train), ncol = n_runs)
rownames(all_var_imp) <- colnames(X_train)

set.seed(123)
for (i in 1:n_runs) {
  train_idx <- createDataPartition(y_clean, p = 0.7, list = FALSE)
  X_tr <- X_clean[train_idx, ]
  X_te <- X_clean[-train_idx, ]
  y_tr <- y_clean[train_idx]
  y_te <- y_clean[-train_idx]
  
  rf <- train(
    x = X_tr,
    y = y_tr,
    method = "rf",
    trControl = trainControl(method = "cv", number = 5),
    preProcess = c("medianImpute"),
    tuneLength = 5
  )
  
  imp <- varImp(rf, scale = TRUE)$importance
  all_var_imp[, i] <- imp$Overall
}

var_imp_mean <- rowMeans(all_var_imp)
var_imp_sd <- apply(all_var_imp, 1, sd)

var_imp_stable <- data.frame(
  Feature = rownames(all_var_imp),
  MeanImportance = var_imp_mean,
  SDImportance = var_imp_sd
) %>% arrange(desc(MeanImportance))

print(head(var_imp_stable, 20))

##                                                                                             Feature
## bmi_vsorres..BMI                                                                   bmi_vsorres..BMI
## whr_vsorres..Waist.to.Hip.Ratio..WHR.                         whr_vsorres..Waist.to.Hip.Ratio..WHR.
## stress_overall_daily_mean_stress                                   stress_overall_daily_mean_stress
## hr_from_ecg                                                                             hr_from_ecg
## sri                                                                                             sri
## mean_active_minutes                                                             mean_active_minutes
## pulse_vsorres..Heart.Rate..bpm.                                     pulse_vsorres..Heart.Rate..bpm.
## mean_pct_steps_morning                                                       mean_pct_steps_morning
## mean_pct_steps_evening                                                       mean_pct_steps_evening
## age                                                                                             age
## cv_total_steps                                                                       cv_total_steps
## sualcage                                                                                   sualcage
## years_of_education                                                               years_of_education
## dmlhex..How.often.would.you.say.that.you.engage.i dmlhex..How.often.would.you.say.that.you.engage.i
## diet6..How.many.times.a.week.did.you.eat.regular. diet6..How.many.times.a.week.did.you.eat.regular.
## dmlvex..In.a.typical.week..how.often.do.you.engag dmlvex..In.a.typical.week..how.often.do.you.engag
## fh_dm2sb_cat                                                                           fh_dm2sb_cat
## dmlsugar..When.reflecting.on.your.typical.eating. dmlsugar..When.reflecting.on.your.typical.eating.
## dmlgrain..In.a.typical.day..how.often.do.you.cons dmlgrain..In.a.typical.day..how.often.do.you.cons
## diet1..How.many.times.a.week.did.you.eat.fast.foo diet1..How.many.times.a.week.did.you.eat.fast.foo
##                                                   MeanImportance SDImportance
## bmi_vsorres..BMI                                       97.794220     7.545864
## whr_vsorres..Waist.to.Hip.Ratio..WHR.                  72.945954    17.990147
## stress_overall_daily_mean_stress                       41.423222    19.519506
## hr_from_ecg                                            28.679576    15.919824
## sri                                                    26.796903    16.669371
## mean_active_minutes                                    26.728777    13.692341
## pulse_vsorres..Heart.Rate..bpm.                        26.363365    14.957959
## mean_pct_steps_morning                                 22.181617    11.449555
## mean_pct_steps_evening                                 21.277253    11.878361
## age                                                    20.500170    11.427062
## cv_total_steps                                         20.180248    11.926483
## sualcage                                               18.225723     9.409087
## years_of_education                                     17.030107    12.084327
## dmlhex..How.often.would.you.say.that.you.engage.i      15.056297    13.616977
## diet6..How.many.times.a.week.did.you.eat.regular.      14.953623     8.906635
## dmlvex..In.a.typical.week..how.often.do.you.engag       8.522026     7.229816
## fh_dm2sb_cat                                            6.963167     7.054268
## dmlsugar..When.reflecting.on.your.typical.eating.       4.843838     3.347394
## dmlgrain..In.a.typical.day..how.often.do.you.cons       4.475492     4.660171
## diet1..How.many.times.a.week.did.you.eat.fast.foo       3.068553     3.475977

# ------------------------------
# 9. SHAP values (conditional contribution)

library(randomForest)
library(iml)

# ------------------------------
predictor <- Predictor$new(
  model = model_rf,
  data = X_train,
  y = y_train,
  type = "prob"
)

# Example: compute SHAP for first 100 rows
sample_idx <- sample(1:nrow(X_train), min(100, nrow(X_train)))
shap_values_list <- lapply(sample_idx, function(i) {
  shap <- Shapley$new(predictor, x.interest = X_train[i, ])
  shap$results
})

shap_all <- do.call(rbind, shap_values_list)
shap_global <- shap_all %>%
  group_by(feature) %>%
  summarise(mean_abs_shap = mean(abs(phi))) %>%
  arrange(desc(mean_abs_shap))

# Combine SHAP results
shap_all <- do.call(rbind, shap_values_list)

# Summarize global SHAP values
shap_global <- shap_all %>%
  group_by(feature) %>%
  summarise(mean_abs_shap = mean(abs(phi)), .groups = "drop") %>%
  arrange(desc(mean_abs_shap))

# Create a nice table with kable
shap_global %>%
  mutate(rank = row_number()) %>%  # optional: add a rank column
  select(rank, everything()) %>%
  kable(
    caption = "Global SHAP Feature Importance",
    col.names = c("Rank", "Feature", "Mean |SHAP|"),
    digits = 3,
    align = c("c", "l", "c")
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE,
    position = "center"
  )

Global SHAP Feature Importance
Rank	Feature	Mean \|SHAP\|
1	bmi_vsorres..BMI	0.128
2	whr_vsorres..Waist.to.Hip.Ratio..WHR.	0.077
3	stress_overall_daily_mean_stress	0.028
4	diet6..How.many.times.a.week.did.you.eat.regular.	0.026
5	hr_from_ecg	0.023
6	dmlhex..How.often.would.you.say.that.you.engage.i	0.023
7	pulse_vsorres..Heart.Rate..bpm.	0.018
8	sri	0.015
9	mean_pct_steps_evening	0.014
10	age	0.014
11	years_of_education	0.014
12	cv_total_steps	0.014
13	mean_active_minutes	0.012
14	sualcage	0.011
15	mean_pct_steps_morning	0.011
16	fh_dm2sb_cat	0.010
17	dmlsugar..When.reflecting.on.your.typical.eating.	0.010
18	dmlvex..In.a.typical.week..how.often.do.you.engag	0.008
19	dmlgrain..In.a.typical.day..how.often.do.you.cons	0.005
20	sualckncf	0.005
21	diet1..How.many.times.a.week.did.you.eat.fast.foo	0.004

library(dplyr)
library(ggplot2)
library(tidyr)

# ------------------------------
# 1. Combine SHAP with categories
# ------------------------------
shap_categorized <- shap_global %>%
  left_join(feature_categories, by = c("feature" = "Feature"))

# ------------------------------
# 2. Compute total and percent contribution by category
# ------------------------------
category_shap <- shap_categorized %>%
  group_by(Category) %>%
  summarise(
    total_shap = sum(mean_abs_shap),
    .groups = "drop"
  ) %>%
  mutate(percent_contribution = total_shap / sum(total_shap) * 100)

# ------------------------------
# 3. Plot
# ------------------------------
ggplot(category_shap, aes(x = reorder(Category, -percent_contribution), y = percent_contribution, fill = Category)) +
  geom_bar(stat = "identity", color = "black") +
  geom_text(aes(label = paste0(round(percent_contribution, 1), "%")), vjust = -0.5, size = 4) +
  labs(
    title = "SHAP Feature Contributions by Category",
    x = "Feature Category",
    y = "Percent of Total Mean |SHAP|"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none") +
  coord_cartesian(ylim = c(0, max(category_shap$percent_contribution) * 1.1))

6 External validation

6.1 Confusion Matrix

df_external  <- read.csv("~/external_test.csv", header = TRUE, stringsAsFactors = FALSE)

unhealthy_vars <- c(
  "diet1..How.many.times.a.week.did.you.eat.fast.foo",  
  "diet4..How.many.regular.sodas.or.glasses.of.sweet",  
  "diet6..How.many.times.a.week.did.you.eat.regular.",  
  "diet7..How.many.times.a.week.did.you.eat.desserts",  
  "diet9..How.many.servings.of.fruit.juice.did.you.d"
)

df_external[unhealthy_vars] <- 2 - df_external[unhealthy_vars]
df_external$diet_unhealthy_score <- rowSums(df_external[unhealthy_vars], na.rm = TRUE)


predictor_names <- c(
  "fh_dm2sb_cat",
  "years_of_education",
  "age",
  "bmi_vsorres..BMI",
  "whr_vsorres..Waist.to.Hip.Ratio..WHR.",
  "pulse_vsorres..Heart.Rate..bpm.",
  "sualcage",
  "sualckncf",
  "diet_unhealthy_score",
  "sri",
  "mean_pct_steps_morning",
  "mean_pct_steps_afternoon",
  "mean_pct_steps_evening",
  "diet4..How.many.regular.sodas.or.glasses.of.sweet",
  "dmlhex..How.often.would.you.say.that.you.engage.i",
  "dmlact..What.would.you.consider.your.typical.acti",
  "dmlvex..In.a.typical.week..how.often.do.you.engag",
  "hr_from_ecg",
  "mean_active_minutes",
  "diet7..How.many.times.a.week.did.you.eat.desserts",
  "diet6..How.many.times.a.week.did.you.eat.regular.",
  "diet1..How.many.times.a.week.did.you.eat.fast.foo",
  "diet9..How.many.servings.of.fruit.juice.did.you.d",
  "dmlpor..When.reflecting.on.your.typical.eating.ha",
  "dmlfrveg..In.a.typical.day..how.many.servings.of.",
  "dmlgrain..In.a.typical.day..how.often.do.you.cons",
  "dmlsugar..When.reflecting.on.your.typical.eating.",
  "sd_wake_clock_h",
  "sd_onset_clock_h",
  "cv_total_steps",
  "stress_overall_daily_mean_stress")

df_external$fh_dm2sb_cat <- trimws(tolower(df_external$fh_dm2sb_cat))
df_external$fh_dm2sb_cat <- factor(df_external$fh_dm2sb_cat, levels=c("no","yes"))

mode_val <- names(sort(table(X_train$fh_dm2sb_cat), decreasing = TRUE))[1]

# Replace NA in external dataset
df_external$fh_dm2sb_cat[is.na(df_external$fh_dm2sb_cat)] <- mode_val

# Ensure factor levels match training set
df_external$fh_dm2sb_cat <- factor(df_external$fh_dm2sb_cat, levels = levels(X_train$fh_dm2sb_cat))

X_ext <- df_external %>% dplyr::select(all_of(predictor_names))

pre_proc <- preProcess(X_train, method = "medianImpute")
X_ext_clean <- predict(pre_proc, X_ext)

y_ext <-  df_external$IR_label

# Replace "Non-IR" → "Non.IR"
df_external$IR_label <- gsub("^Non-IR$", "Non.IR", df_external$IR_label)

# Fix any spacing issues
df_external$IR_label <- trimws(df_external$IR_label)

# Convert to factor with correct levels
y_ext <- factor(df_external$IR_label)

pred_class_ext <- predict(model_rf, X_ext_clean)
cm_ext <- caret::confusionMatrix(pred_class_ext, y_ext)

library(ggplot2)
library(dplyr)
library(scales)

# ------------------------------
# 1. Convert confusionMatrix to tidy dataframe

total_n <- length(y_train) + length(y_test)

split_counts <- data.frame(
  Set = factor(c("Train", "Test"), levels = c("Train", "Test")),  # force order
  N = c(length(y_train), length(y_test))
) %>%
  mutate(Percent = N / total_n * 100)  # percentage

# ------------------------------
# 2. Plot
# ------------------------------
split_plot <- ggplot(split_counts, aes(x = Set, y = N, fill = Set)) +
  geom_bar(stat = "identity", width = 0.5) +
  geom_text(aes(label = paste0(N, "\n(", round(Percent,1), "%)")), 
            vjust = -0.5, size = 5) +
  scale_fill_brewer(palette = "Set3") +
  labs(title = "Train vs Test Sample Sizes", y = "Number of Samples", x = "", face = "bold") +
  theme_minimal(base_size = 8) +
  theme(legend.position = "none") +
  theme(aspect.ratio=0.9)

# ------------------------------
cm_ext_df <- as.data.frame(cm_ext$table) %>%
  dplyr::rename(
    True = Reference,
    Predicted = Prediction,
    Count = Freq
  ) %>%
  group_by(True) %>%
  mutate(
    Percent = Count / sum(Count)  # proportion per true class
  ) %>%
  ungroup()

# ------------------------------
# 2. Plot confusion matrix heatmap
# ------------------------------
cm_ext_plot <- ggplot(cm_ext_df, aes(x = Predicted, y = True, fill = Percent)) +
  geom_tile(color = "white", size = 0.5) +
  geom_text(aes(label = paste0(Count, "\n(", round(Percent*100,1), "%)")), size = 6) +
  scale_fill_gradient(low = "#DCE6F1", high = "#2C7BB6", labels = percent_format()) +
  labs(
    title = "External Test Set Confusion Matrix",
    x = "Predicted Label",
    y = "True Label",
    fill = "Proportion"
  ) +
  theme_minimal(base_size = 8) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, face = "bold"),
    axis.text.y = element_text(face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.title = element_text(face = "bold"))

# ------------------------------
# 3. Display the plot
# ------------------------------
combined_plot <- split_plot + cm_ext_plot + plot_layout(ncol = 2, widths = c(2, 4))
combined_plot

# ------------------------------
# 3. Display
# ------------------------------

6.2 RF Performance Metrics

 metrics_ext <- data.frame(
  Accuracy = cm_ext$overall["Accuracy"],
  Sensitivity = cm_ext$byClass["Sensitivity"],
  Specificity = cm_ext$byClass["Specificity"],
  Balanced_Accuracy = (cm_ext$byClass["Sensitivity"] + cm_ext$byClass["Specificity"]) / 2,  # <-- new
  Precision = cm_ext$byClass["Pos Pred Value"],
  F1_Score = 2 * (cm_ext$byClass["Pos Pred Value"] * cm_ext$byClass["Sensitivity"]) /
    (cm_ext$byClass["Pos Pred Value"] + cm_ext$byClass["Sensitivity"])
)

library(knitr)
library(kableExtra)
library(dplyr)

# ------------------------------
# 1. Clean numeric metrics for display
# ------------------------------
metrics_ext_clean <- metrics_ext %>%
  # Convert all columns to numeric
  mutate(across(everything(), ~ as.numeric(.))) %>%
  # Round to 3 decimal places for readability
  mutate(across(everything(), ~ round(., 3))) %>%
  # Convert to long format for nice table
  tidyr::pivot_longer(
    cols = everything(),
    names_to = "Metric",
    values_to = "Value"
  )

# ------------------------------
# 2. Create HTML kable
# ------------------------------
kable(metrics_ext_clean, "html", caption = "Extended Model Performance Metrics") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = T,
    font_size = 14
  ) %>%
  row_spec(0, bold = TRUE, background = "#A5D8FF") %>%
  column_spec(2, bold = TRUE)

Extended Model Performance Metrics
Metric	Value
Accuracy	0.696
Sensitivity	0.577
Specificity	0.800
Balanced_Accuracy	0.688
Precision	0.714
F1_Score	0.638

 library(dplyr)
library(ggplot2)
library(gridExtra)

# ------------------------------
# 1. Train/Test counts (unchanged)
# ------------------------------