1 Background

Insulin resistance (IR)—a state in which the body requires more insulin to achieve normal glucose regulation is a major predictor of type 2 diabetes. Early detection is essential, yet standard blood test measures such as the oral glucose tolerance test, euglycemic–hyperinsulinemic clamp, or the HOMA-IR require are invasive, costly, and cannot be used for frequent or continuous assessment.

Digital biomarkers—physiological, lifestyle and behavioral data collected via wearables and mobile devices—may potentially offer a scalable, non-invasive alternative for early IR screening. We evaluated whether non-invasive digital biomarkers combining sleep, physical activity, nutrfactors, and anthopopometrics, and nutrition can predict HOMA-IR–based IR in adults with normoglycemia, prediabetes and type-2 diabetes.

Demonstrating intitial feasibility, random forest models, built on lifestyle data and anthropometrics, predicted IR out-of-sample (Study 1: balanced accuracy = 0.71, 95% CI: 0.69–0.73, p = .020; AUC = 0.87, 95% CI: 0.85–0.88, p = .020) and replicated in an independent test cohort (Study 2: N=56; balanced accuracy = 0.71, 95% CI: 0.69–0.73, p = .020; AUC = 0.87, 95% CI: 0.85–0.88, p = .020); with top predictors including [insert features]. Results remained robust across modeling specifications.

  • Together, findings highlight the potential of digital biomarkers for early, non-invasive detection of insulin resistance,futher triggering reventive lifestyle interventions delivered in daily life.

2 Data Preparation

We use the Flagship Dataset of Type 2 Diabetes from open source AI-READI Project.

  • Background: (https://aireadi.org)
  • Documentation: https://docs.aireadi.org
  • Version 2 of the AI‑READI “Flagship Dataset of Type-2 Diabetes” was made publicly available on November 8,2024.
  • AI-READI, funded by the U.S. National Institutes of Health through the Bridge2AI Program, aims to collect and share a multimodal, AI-ready dataset for studying salutogenesis in Type 2 Diabetes Mellitus.

2.1 PRISMA inclusion

  • We include adult participants across three clinical sites (UAB, UW, UCSD) ) who are normoglycemic, have prediabetes, or have type 2 diabetes, and who meet the following criteria:

    • Reported fasting (8hours or more) prior to glucose and insulin blood tests
    • Are not on insulin medication
    • Have available smartwatch data (Garmin venu)
    • Have available blood test results (fasting glucose & insulin) to be able to calculate HOMA-IR (Homeostatic Model Assessment of Insulin Resistance)


  • We further split participants by site:

    • UAB and UW for model development and testing (Study 1),
    • UCSD is held out/ serves as the external model validation cohort (Study 2).

Both cohorts follow the same inclusion criteria described above.

# ========================================
# STEP 1: Load datasets
# ========================================
hba1c <- read.delim("~/Downloads/participants.tsv", header = TRUE, stringsAsFactors = FALSE)
biom  <- read.csv("~/Downloads/biomarker_data.csv", header = TRUE, stringsAsFactors = FALSE)

df <- merge(hba1c, biom, by.x = "participant_id", by.y = "person_id")
n_start <- length(unique(df$participant_id))

# ========================================
# STEP 2: Process observation file for fasting
# ========================================
ob <- read.delim("~/Downloads/observation(in).csv", header = TRUE, stringsAsFactors = FALSE)

ob_split <- ob %>%
  separate(
    col = 1,
    into = c("observation_id","person_id","observation_concept_id",
             "observation_date","observation_datetime","observation_type_concept_id",
             "value_as_number","value_as_string","value_as_concept_id",
             "qualifier_concept_id","unit_concept_id","provider_id",
             "visit_occurrence_id","visit_detail_id","observation_source_value",
             "observation_source_concept_id","unit_source_value","value_source_value",
             "value_source_concept_id","some_extra1","some_extra2"),
    sep = ",", fill = "right"
  )

paate_rows <- ob_split %>%
  filter(grepl("paate", observation_source_value, ignore.case = TRUE)) %>%
  select(value_as_string, person_id)

paate_rows$value_as_string <- as.numeric(paate_rows$value_as_string)
paate_rows$person_id <- as.integer(paate_rows$person_id)

fasting_rows <- paate_rows %>% filter(value_as_string >= 8)

# ========================================
# STEP 3: Sequential filtering
# ========================================
# Step 1: Non-insulin-dependent
df_step1 <- df %>% filter(!study_group %in% "insulin_dependent")
n_step1 <- length(unique(df_step1$participant_id))
removed_step1 <- n_start - n_step1

# Step 2: Fasting 8+ hours
df_step2 <- df_step1 %>%
  left_join(fasting_rows, by = c("participant_id" = "person_id")) %>%
  filter(!is.na(value_as_string))
n_step2 <- length(unique(df_step2$participant_id))
removed_step2 <- n_step1 - n_step2

# Step 3: Exclude UCSD site
df_step3 <- df_step2 %>% filter(clinical_site != "UCSD")
n_step3 <- length(unique(df_step3$participant_id))
removed_step3 <- n_step2 - n_step3

# Step 4: Wearable data available
df_step4 <- df_step3 %>% filter(wearable_activity_monitor == TRUE)
n_step4 <- length(unique(df_step4$participant_id))
removed_step4 <- n_step3 - n_step4

# Step 5: Remove participants with missing Glucose OR INSULIN
df_step5 <- df_step4 %>% filter(!is.na(Glucose..mg.dL.) & !is.na(INSULIN..ng.mL.))
n_step5 <- length(unique(df_step5$participant_id))
removed_step5 <- n_step4 - n_step5

# Step 6: Compute HOMA_insulin
df_step5$HOMA_insulin <- ((df_step5$INSULIN..ng.mL. * 24) * df_step5$Glucose..mg.dL.) / 405

# Step 7: Remove HOMA outliers (>3 SD above median)
median_homa <- median(df_step5$HOMA_insulin, na.rm = TRUE)
sd_homa <- sd(df_step5$HOMA_insulin, na.rm = TRUE)

df_final <- df_step5 %>%
  filter(HOMA_insulin <= (median_homa + 3 * sd_homa))
n_final <- length(unique(df_final$participant_id))
removed_step6 <- n_step5 - n_final

# ========================================
# UCSD-only filtering (Stepwise, remove other sites explicitly)
# ========================================
df_ucsd <- df  # start with full dataset

# Step 1: Non-insulin-dependent
df_ucsd_step1 <- df_ucsd %>% filter(!study_group %in% "insulin_dependent")
n_step1_ucsd <- length(unique(df_ucsd_step1$participant_id))
removed_step1_ucsd <- n_start - n_step1_ucsd  # same starting n_start

# Step 2: Fasting 8+ hours
df_ucsd_step2 <- df_ucsd_step1 %>%
  left_join(fasting_rows, by = c("participant_id" = "person_id")) %>%
  filter(!is.na(value_as_string))
n_step2_ucsd <- length(unique(df_ucsd_step2$participant_id))
removed_step2_ucsd <- n_step1_ucsd - n_step2_ucsd

# Step 3: Keep only UCSD, remove participants from other two sites
df_ucsd_step3 <- df_ucsd_step2 %>% filter(clinical_site == "UCSD")
n_step3_ucsd <- length(unique(df_ucsd_step3$participant_id))
removed_step3_ucsd <- n_step2_ucsd - n_step3_ucsd

# Step 4: Wearable data available
df_ucsd_step4 <- df_ucsd_step3 %>% filter(wearable_activity_monitor == TRUE)
n_step4_ucsd <- length(unique(df_ucsd_step4$participant_id))
removed_step4_ucsd <- n_step3_ucsd - n_step4_ucsd

# Step 5: Remove participants with missing Glucose OR INSULIN
df_ucsd_step5 <- df_ucsd_step4 %>% filter(!is.na(Glucose..mg.dL.) & !is.na(INSULIN..ng.mL.))
n_step5_ucsd <- length(unique(df_ucsd_step5$participant_id))
removed_step5_ucsd <- n_step4_ucsd - n_step5_ucsd

# Step 6: Compute HOMA_insulin
df_ucsd_step5$HOMA_insulin <- ((df_ucsd_step5$INSULIN..ng.mL. * 24) * df_ucsd_step5$Glucose..mg.dL.) / 405

# Step 7: Remove HOMA outliers (>3 SD above median)
median_homa_ucsd <- median(df_ucsd_step5$HOMA_insulin, na.rm = TRUE)
sd_homa_ucsd <- sd(df_ucsd_step5$HOMA_insulin, na.rm = TRUE)

df_ucsd_final <- df_ucsd_step5 %>%
  filter(HOMA_insulin <= (median_homa_ucsd + 3 * sd_homa_ucsd))
n_final_ucsd <- length(unique(df_ucsd_final$participant_id))
removed_step6_ucsd <- n_step5_ucsd - n_final_ucsd

# ========================================
# Side-by-side PRISMA (original visuals)
# ========================================
grViz(glue('
digraph prisma_sidebyside {{
  rankdir = TB;
  graph [layout = dot, nodesep = 0.5, ranksep = 0.75]

  node [
    shape = box,
    style = "rounded,filled",
    color = "#4A4A4A",
    fontcolor = "black",
    fillcolor = "#E8F0FE",
    fontsize = 12,
    penwidth = 1.2,
    width = 3
  ]

  # Study 1: Model Development & Testing
  subgraph cluster_a {{
    label = "Study 1: Model Development & Testing"
    a1 [label="Starting dataset\\n(n = {n_start})"]
    b1 [label="Non–insulin-dependent\\n(n = {n_step1})\\n(removed {removed_step1})"]
    c1 [label="Fasting 8+ hours\\n(n = {n_step2})\\n(removed {removed_step2})"]
    d1 [label="Clinical site != UCSD\\n(n = {n_step3})\\n(removed {removed_step3})"]
    e1 [label="Have wearable data\\n(n = {n_step4})\\n(removed {removed_step4})"]
    f1 [label="Glucose OR INSULIN not missing\\n(n = {n_step5})\\n(removed {removed_step5})"]
    g1 [label="HOMA outliers removed\\n(n = {n_final})\\n(removed {removed_step6})"]
    a1 -> b1 -> c1 -> d1 -> e1 -> f1 -> g1
  }}

  # Study 2: External Validation
  subgraph cluster_b {{
    label = "Study 2: UCSD-only (External Validation)"
    a2 [label="Starting dataset\\n(n = {n_start})"]
    b2 [label="Non–insulin-dependent\\n(n = {n_step1_ucsd})\\n(removed {removed_step1_ucsd})"]
    c2 [label="Fasting 8+ hours\\n(n = {n_step2_ucsd})\\n(removed {removed_step2_ucsd})"]
    d2 [label="Only UCSD site\\n(n = {n_step3_ucsd})\\n(removed {removed_step3_ucsd})"]
    e2 [label="Have wearable data\\n(n = {n_step4_ucsd})\\n(removed {removed_step4_ucsd})"]
    f2 [label="Glucose OR INSULIN not missing\\n(n = {n_step5_ucsd})\\n(removed {removed_step5_ucsd})"]
    g2 [label="HOMA outliers removed\\n(n = {n_final_ucsd})\\n(removed {removed_step6_ucsd})"]
    a2 -> b2 -> c2 -> d2 -> e2 -> f2 -> g2
  }}
}}
'))

2.2 HOMA-IR Calculation

In the next data preparation section, we take four steps to construct and refine our insulin-resistance measures. First, we calculate a person-level continuous insulin resistance score using the HOMA-IR (Homeostatic Model Assessment for Insulin Resistance) equation, which provides a widely used physiological estimate of insulin resistance derived from fasting glucose and insulin. Second, we inspect and treat outliers to reduce the influence of implausible or extreme values (e.g., non fasting values). Third, we binarize the continuous HOMA-IR score into “insulin resistant” versus “non-resistant” categories, allowing for categorical comparisons and easier interpretability in later modeling. Finally, we descriptively compare our binary insulin-resistance classification to the distribution of HbA1c, which helps illustrate how the HOMA-IR–based categorization aligns with established clinically relevant glycemic marker.

2.2.2 2. Outlier Inspection

  • We removed values that fell +3 SD deviations from the median to account for improbable values (e.g. individuals who likely did not fast; HOMA-IS >35).

  • Aplied to 3 individuals in Study 1 and to 1 individual in Study 2.

# -----------------------------
before <- data.frame(HOMA = c(df_step5$HOMA_insulin)) # replace with real data
before_no_outliers <- before %>%
  filter(HOMA >= mean(HOMA, na.rm = TRUE) - 3*sd(HOMA, na.rm = TRUE),
         HOMA <= mean(HOMA, na.rm = TRUE) + 3*sd(HOMA, na.rm = TRUE))

# -----------------------------
# PANEL A: Density Before
# -----------------------------
mean_val_A   <- mean(before$HOMA, na.rm = TRUE)
median_val_A <- median(before$HOMA, na.rm = TRUE)
sd_val_A     <- sd(before$HOMA, na.rm = TRUE)
min_val_A    <- min(before$HOMA, na.rm = TRUE)
max_val_A    <- max(before$HOMA, na.rm = TRUE)

dens_A <- density(before$HOMA, na.rm = TRUE)
ymax_plot_A <- max(dens_A$y)*1.05
legend_x_A <- min_val_A + 0.3*(max_val_A - min_val_A)
symbol_y_A <- seq(ymax_plot_A - 0.08*ymax_plot_A, ymax_plot_A - 0.22*ymax_plot_A, length.out = 3)
legend_y_top_A <- ymax_plot_A - 0.02*ymax_plot_A
legend_df_A <- data.frame(
  x = rep(legend_x_A, 3),
  y = symbol_y_A,
  label = c("Mean","Median","SD"),
  text = c(paste0("Mean: ", round(mean_val_A,2)),
           paste0("Median: ", round(median_val_A,2)),
           paste0("SD: ", round(sd_val_A,2)))
)
min_max_text_A <- paste0("Min: ", round(min_val_A,2), " | Max: ", round(max_val_A,2))

p_density_A <- ggplot(before, aes(HOMA)) +
  geom_density(fill="#A5D8FF", alpha=0.5, color="#1B6CA8", linewidth=1.1) +
  geom_vline(aes(xintercept=mean_val_A, color="Mean"), linewidth=1) +
  geom_vline(aes(xintercept=median_val_A, color="Median"), linewidth=1) +
  geom_vline(aes(xintercept=mean_val_A + sd_val_A, color="SD"), linetype="dotted") +
  geom_vline(aes(xintercept=mean_val_A - sd_val_A, color="SD"), linetype="dotted") +
  geom_point(data=legend_df_A, aes(x=x, y=y, color=label), size=1.5) +
  geom_text(data=legend_df_A, aes(x=x + 0.02*(max(before$HOMA)-min(before$HOMA)),
                                  y=y, label=text),
            hjust=0, color="black", size=3) +  # slightly smaller text
  annotate("text", x=legend_x_A, y=legend_y_top_A, label=min_max_text_A,
           hjust=0, color="black", size=3) +
  scale_color_manual(values=c("Mean"="#00468B","Median"="darkred","SD"="gray")) +
  labs(title="A: Including Outliers", x="HOMA-Insulin", y="Density") +
  theme_classic(base_size=10) +
  theme(legend.position="none",
        plot.title = element_text(face="bold", hjust=0, size=12)) +
  ylim(0, ymax_plot_A) +
  theme(aspect.ratio=0.9)

# -----------------------------
# PANEL B: Density After
# -----------------------------
mean_val_B   <- mean(before_no_outliers$HOMA, na.rm = TRUE)
median_val_B <- median(before_no_outliers$HOMA, na.rm = TRUE)
sd_val_B     <- sd(before_no_outliers$HOMA, na.rm = TRUE)
min_val_B    <- min(before_no_outliers$HOMA, na.rm = TRUE)
max_val_B    <- max(before_no_outliers$HOMA, na.rm = TRUE)

dens_B <- density(before_no_outliers$HOMA, na.rm = TRUE)
ymax_plot_B <- max(dens_B$y)*1.05
legend_x_B <- min_val_B + 0.3*(max_val_B - min_val_B)
symbol_y_B <- seq(ymax_plot_B - 0.08*ymax_plot_B, ymax_plot_B - 0.22*ymax_plot_B, length.out = 3)
legend_y_top_B <- ymax_plot_B - 0.02*ymax_plot_B
legend_df_B <- data.frame(
  x=rep(legend_x_B,3),
  y=symbol_y_B,
  label=c("Mean","Median","SD"),
  text=c(paste0("Mean: ", round(mean_val_B,2)),
         paste0("Median: ", round(median_val_B,2)),
         paste0("SD: ", round(sd_val_B,2)))
)
min_max_text_B <- paste0("Min: ", round(min_val_B,2), " | Max: ", round(max_val_B,2))

p_density_B <- ggplot(before_no_outliers, aes(HOMA)) +
  geom_density(fill="#A5D8FF", alpha=0.5, color="#1B6CA8", linewidth=1.1) +
  geom_vline(aes(xintercept=mean_val_B, color="Mean"), linewidth=1) +
  geom_vline(aes(xintercept=median_val_B, color="Median"), linewidth=1) +
  geom_vline(aes(xintercept=mean_val_B + sd_val_B, color="SD"), linetype="dotted") +
  geom_vline(aes(xintercept=mean_val_B - sd_val_B, color="SD"), linetype="dotted") +
  geom_point(data=legend_df_B, aes(x=x, y=y, color=label), size=1.5) +
  geom_text(data=legend_df_B, aes(x=x + 0.02*(max(before_no_outliers$HOMA)-min(before_no_outliers$HOMA)),
                                  y=y, label=text),
            hjust=0, color="black", size=3) +  # slightly smaller text
  annotate("text", x=legend_x_B, y=legend_y_top_B, label=min_max_text_B,
           hjust=0, color="black", size=3) +
    scale_color_manual(values=c("Mean"="#00468B","Median"="darkred","SD"="gray")) +
  labs(title="B: Post Outlier Removal", x="HOMA-Insulin", y="Density") +
  theme_classic(base_size=10) +
  theme(legend.position="none",
        plot.title = element_text(face="bold", hjust=0, size=12)) +
  ylim(0, ymax_plot_B) +
  theme(aspect.ratio=0.9)


# -----------------------------
# PANEL C: Boxplot Before
# -----------------------------
line_3sd <- median(before$HOMA, na.rm = TRUE) + 3*sd(before$HOMA, na.rm = TRUE)

# Add a color column: dark red if > median + 3 SD, else default
before <- before %>%
  mutate(color_flag = ifelse(HOMA > line_3sd, "above_3SD", "normal"))

# Plot
p_A <- ggplot(before, aes(x = "", y = HOMA)) +
  geom_boxplot(width = 0.4, fill = "#A5D8FF", alpha = 0.7, outlier.shape = NA) +
  geom_jitter(aes(color = color_flag), width = 0.1, size = 2, alpha = 0.6) +
  scale_color_manual(values = c("normal" = "#1B6CA8", "above_3SD" = "darkred")) +
  geom_hline(yintercept = line_3sd, linetype = "dashed", color = "gray", size = 0.7) +
  annotate("text", x = 1.2, y = line_3sd + 2, 
           label = paste0("Med. + 3SD = ", round(line_3sd,2)),
           hjust = 0, color = "gray20", size = 3) +
  labs(title = "C: Including Outliers", x = "", y = "HOMA-Insulin") +
  theme_classic(base_size = 10) +
  theme(plot.title = element_text(face = "bold", hjust = 0, size = 12),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "none") +
  theme(aspect.ratio = 0.9)

# -----------------------------
# PANEL D: Boxplot After
# -----------------------------
p_B <- ggplot(before_no_outliers, aes(x="", y=HOMA)) +
  geom_boxplot(width=0.4, fill="#A5D8FF", alpha=0.7, outlier.shape=NA) +
  geom_jitter(width=0.1, size=2, color="#1B6CA8", alpha=0.4) +
  labs(title="D: Post Outlier Removal", x="", y="HOMA-Insulin") +
  theme_classic(base_size=10) +
  theme(plot.title=element_text(face="bold", hjust=0, size=12),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) +
  theme(aspect.ratio=0.9)

# -----------------------------
# Combine 4 plots in 2x2 grid
# -----------------------------
plot_grid(p_density_A, p_density_B,
          p_A, p_B,
          ncol=2, align="hv")

2.2.3 3. Categorical HOMA-IR (DV)

  • To binarize HOMA-IR, we used > 2.5 as a cut-off (based on NHANES and U.S. Adult Studies).
# -----------------------------
# Prepare data
# -----------------------------
before <- data.frame(HOMA = df_ucsd_step5$HOMA_insulin)
before_no_outliers <- before %>%
  filter(HOMA >= mean(HOMA, na.rm = TRUE) - 3*sd(HOMA, na.rm = TRUE),
         HOMA <= mean(HOMA, na.rm = TRUE) + 3*sd(HOMA, na.rm = TRUE))

# -----------------------------
# PANEL A: Density Before
# -----------------------------
mean_val_A   <- mean(before$HOMA, na.rm = TRUE)
median_val_A <- median(before$HOMA, na.rm = TRUE)
sd_val_A     <- sd(before$HOMA, na.rm = TRUE)
min_val_A    <- min(before$HOMA, na.rm = TRUE)
max_val_A    <- max(before$HOMA, na.rm = TRUE)

dens_A <- density(before$HOMA, na.rm = TRUE)
ymax_plot_A <- max(dens_A$y)*1.05
legend_x_A <- min_val_A + 0.3*(max_val_A - min_val_A)
symbol_y_A <- seq(ymax_plot_A - 0.08*ymax_plot_A, ymax_plot_A - 0.22*ymax_plot_A, length.out = 3)
legend_y_top_A <- ymax_plot_A - 0.02*ymax_plot_A
legend_df_A <- data.frame(
  x = rep(legend_x_A, 3),
  y = symbol_y_A,
  label = c("Mean","Median","SD"),
  text = c(paste0("Mean: ", round(mean_val_A,2)),
           paste0("Median: ", round(median_val_A,2)),
           paste0("SD: ", round(sd_val_A,2)))
)
min_max_text_A <- paste0("Min: ", round(min_val_A,2), " | Max: ", round(max_val_A,2))

p_density_A <- ggplot(before, aes(HOMA)) +
  geom_density(fill="#A5D8FF", alpha=0.5, color="#1B6CA8", linewidth=1.1) +
  geom_vline(aes(xintercept=mean_val_A, color="Mean"), linewidth=1) +
  geom_vline(aes(xintercept=median_val_A, color="Median"), linewidth=1) +
  geom_vline(aes(xintercept=mean_val_A + sd_val_A, color="SD"), linetype="dotted") +
  geom_vline(aes(xintercept=mean_val_A - sd_val_A, color="SD"), linetype="dotted") +
  geom_point(data=legend_df_A, aes(x=x, y=y, color=label), size=1.5) +
  geom_text(data=legend_df_A, aes(x=x + 0.02*(max(before$HOMA)-min(before$HOMA)), y=y, label=text),
            hjust=0, color="black", size=3) +
  annotate("text", x=legend_x_A, y=legend_y_top_A, label=min_max_text_A,
           hjust=0, color="black", size=3) +
  scale_color_manual(values=c("Mean"="darkred","Median"="#005b96","SD"="#011f4b")) +
  labs(title="A: Including Outliers", x="HOMA-Insulin", y="Density") +
  theme_classic(base_size=10) +
  theme(legend.position="none",
        plot.title = element_text(face="bold", hjust=0, size=12)) +
  ylim(0, ymax_plot_A) +
  theme(aspect.ratio=0.9)

# -----------------------------
# PANEL B: Density After
# -----------------------------
mean_val_B   <- mean(before_no_outliers$HOMA, na.rm = TRUE)
median_val_B <- median(before_no_outliers$HOMA, na.rm = TRUE)
sd_val_B     <- sd(before_no_outliers$HOMA, na.rm = TRUE)
min_val_B    <- min(before_no_outliers$HOMA, na.rm = TRUE)
max_val_B    <- max(before_no_outliers$HOMA, na.rm = TRUE)

dens_B <- density(before_no_outliers$HOMA, na.rm = TRUE)
ymax_plot_B <- max(dens_B$y)*1.05
legend_x_B <- min_val_B + 0.3*(max_val_B - min_val_B)
symbol_y_B <- seq(ymax_plot_B - 0.08*ymax_plot_B, ymax_plot_B - 0.22*ymax_plot_B, length.out = 3)
legend_y_top_B <- ymax_plot_B - 0.02*ymax_plot_B
legend_df_B <- data.frame(
  x=rep(legend_x_B,3),
  y=symbol_y_B,
  label=c("Mean","Median","SD"),
  text=c(paste0("Mean: ", round(mean_val_B,2)),
         paste0("Median: ", round(median_val_B,2)),
         paste0("SD: ", round(sd_val_B,2)))
)
min_max_text_B <- paste0("Min: ", round(min_val_B,2), " | Max: ", round(max_val_B,2))

p_density_B <- ggplot(before_no_outliers, aes(HOMA)) +
  geom_density(fill="#A5D8FF", alpha=0.5, color="#1B6CA8", linewidth=1.1) +
  geom_vline(aes(xintercept=mean_val_B, color="Mean"), linewidth=1) +
  geom_vline(aes(xintercept=median_val_B, color="Median"), linewidth=1) +
  geom_vline(aes(xintercept=mean_val_B + sd_val_B, color="SD"), linetype="dotted") +
  geom_vline(aes(xintercept=mean_val_B - sd_val_B, color="SD"), linetype="dotted") +
  geom_point(data=legend_df_B, aes(x=x, y=y, color=label), size=1.5) +
  geom_text(data=legend_df_B, aes(x=x + 0.02*(max(before_no_outliers$HOMA)-min(before_no_outliers$HOMA)),
                                  y=y, label=text),
            hjust=0, color="black", size=3) +
  annotate("text", x=legend_x_B, y=legend_y_top_B, label=min_max_text_B,
           hjust=0, color="black", size=3) +
  scale_color_manual(values=c("Mean"="darkred","Median"="#005b96","SD"="#011f4b")) +
  labs(title="B: Post Outlier Removal", x="HOMA-Insulin", y="Density") +
  theme_classic(base_size=10) +
  theme(legend.position="none",
        plot.title = element_text(face="bold", hjust=0, size=12)) +
  ylim(0, ymax_plot_B) +
  theme(aspect.ratio=0.9)

# -----------------------------
# PANEL C: Boxplot Before
# -----------------------------
line_3sd <- median(before$HOMA, na.rm = TRUE) + 3*sd(before$HOMA, na.rm = TRUE)

before <- before %>%
  mutate(color_flag = ifelse(HOMA > line_3sd, "above_3SD", "normal"))

p_A <- ggplot(before, aes(x = "", y = HOMA)) +
  geom_boxplot(width = 0.4, fill = "#A5D8FF", alpha = 0.7, outlier.shape = NA) +
  geom_jitter(aes(color = color_flag), width = 0.1, size = 2, alpha = 0.6) +
  scale_color_manual(values = c("normal" = "#1B6CA8", "above_3SD" = "darkred")) +
  geom_hline(yintercept = line_3sd, linetype = "dashed", color = "gray", size = 0.7) +
  annotate("text", x = 1.2, y = line_3sd + 2, 
           label = paste0("Med. + 3SD = ", round(line_3sd,2)),
           hjust = 0, color = "gray20", size = 3) +
  labs(title = "C: Including Outliers", x = "", y = "HOMA-Insulin") +
  theme_classic(base_size = 10) +
  theme(plot.title = element_text(face = "bold", hjust = 0, size = 12),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "none") +
  theme(aspect.ratio = 0.9)

# -----------------------------
# PANEL D: Boxplot After
# -----------------------------
p_B <- ggplot(before_no_outliers, aes(x="", y=HOMA)) +
  geom_boxplot(width=0.4, fill="#A5D8FF", alpha=0.7, outlier.shape=NA) +
  geom_jitter(width=0.1, size=2, color="#1B6CA8", alpha=0.4) +
  labs(title="D: Post Outlier Removal", x="", y="HOMA-Insulin") +
  theme_classic(base_size=10) +
  theme(plot.title=element_text(face="bold", hjust=0, size=12),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) +
  theme(aspect.ratio=0.9)

# -----------------------------
# Combine 4 plots in 2x2 grid
# -----------------------------
# --- Top row: density plots ---
top_row <- cowplot::plot_grid(p_density_A, p_density_B, ncol = 2, labels = c("A", "B"))
# --- Bottom row: HOMA plots ---
bottom_row <- cowplot::plot_grid(p_A, p_B, ncol = 2, labels = c("C", "D"))
# --- Combine top and bottom rows vertically ---
combined_plot <- cowplot::plot_grid(top_row, bottom_row, ncol = 1, align = "v")
combined_plot

2.2.4 4. HOMA-IR- HbA1c comparison

# ============================================================
# Load libraries
# ============================================================
# ============================================================
# Shared Colors
# ============================================================
colors_homa <- c("Non-IR" = "#1B6CA8", "IR" = "#953553")

colors_hba1c <- c(
  "Normoglycemic" = "#1B6CA8",
  "Prediabetic"   = "#E69F00",
  "T2 Diabetic"   = "#953553"
)

# ============================================================
# -------------------- Study 1: HOMA-IR -----------------------
# ============================================================
df_f <- df_final %>%
  mutate(
    IR_status = ifelse(HOMA_insulin >= 2.5, 1, 0),
    IR_label  = ifelse(IR_status == 1, "IR", "Non-IR")
  )

counts_homa <- df_f %>% group_by(IR_label) %>% summarise(n = n())
legend_labels_homa <- paste0(counts_homa$IR_label, ": ", counts_homa$n)
names(legend_labels_homa) <- counts_homa$IR_label

p_homa <- ggplot(df_f, aes(x = "", y = HOMA_insulin, color = IR_label)) +
  geom_jitter(width = 0.1, size = 2, alpha = 0.8) +
  geom_hline(yintercept = 2.5, linetype = "dashed", color = "gray40", size = 1) +
  labs(title = "Study 1: HOMA-IR ≥ 2.5", y = "HOMA-IR", x = "", color = "IR Status") +
  theme_classic(base_size = 10) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        aspect.ratio = 0.9) +
  scale_color_manual(values = colors_homa, labels = legend_labels_homa)

# ============================================================
# -------------------- Study 2: HOMA-IR -----------------------
# ============================================================
df_s2 <- df_ucsd_final %>%
  mutate(
    IR_status = ifelse(HOMA_insulin >= 2.5, 1, 0),
    IR_label  = ifelse(IR_status == 1, "IR", "Non-IR")
  )

counts_homa_s2 <- df_s2 %>% group_by(IR_label) %>% summarise(n = n())
legend_labels_homa_s2 <- paste0(counts_homa_s2$IR_label, ": ", counts_homa_s2$n)
names(legend_labels_homa_s2) <- counts_homa_s2$IR_label

p_homa_s2 <- ggplot(df_s2, aes(x = "", y = HOMA_insulin, color = IR_label)) +
  geom_jitter(width = 0.1, size = 2, alpha = 0.8) +
  geom_hline(yintercept = 2.5, linetype = "dashed", color = "gray40", size = 1) +
  labs(title = "Study 2: HOMA-IR ≥ 2.5", y = "HOMA-IR", x = "", color = "IR Status") +
  theme_classic(base_size = 10) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        aspect.ratio = 0.9) +
  scale_color_manual(values = colors_homa, labels = legend_labels_homa_s2)

# ============================================================
# -------------------- Study 1: HbA1c -------------------------
# ============================================================
df_final <- df_final %>%
  mutate(
    HbA1c_value = HbA1c....,
    Glycemic_label = case_when(
      HbA1c_value < 5.7 ~ "Normoglycemic",
      HbA1c_value >= 5.7 & HbA1c_value < 6.5 ~ "Prediabetic",
      HbA1c_value >= 6.5 ~ "T2 Diabetic"
    )
  )

counts_hba1c <- df_final %>% group_by(Glycemic_label) %>% summarise(n = n())
legend_labels_hba1c <- paste0(counts_hba1c$Glycemic_label, ": ", counts_hba1c$n)
names(legend_labels_hba1c) <- counts_hba1c$Glycemic_label

p_hba1c <- ggplot(df_final, aes(x = "", y = HbA1c_value, color = Glycemic_label)) +
  geom_jitter(width = 0.12, size = 2, alpha = 0.8) +
  geom_hline(yintercept = 5.7, linetype = "dashed", color = "gray40", size = 1) +
  geom_hline(yintercept = 6.5, linetype = "dashed", color = "gray40", size = 1) +
  labs(title = "Study 1: HbA1c Classification", y = "HbA1c (%)", x = "", color = "Status") +
  theme_classic(base_size = 10) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        aspect.ratio = 1) +
  scale_color_manual(values = colors_hba1c, labels = legend_labels_hba1c)

# ============================================================
# -------------------- Study 2: HbA1c -------------------------
# ============================================================
df_s2 <- df_ucsd_final %>%
  mutate(
    HbA1c_value = HbA1c....,
    Glycemic_label = case_when(
      HbA1c_value < 5.7 ~ "Normoglycemic",
      HbA1c_value >= 5.7 & HbA1c_value < 6.5 ~ "Prediabetic",
      HbA1c_value >= 6.5 ~ "T2 Diabetic"
    )
  )

counts_hba1c_s2 <- df_s2 %>% group_by(Glycemic_label) %>% summarise(n = n())
legend_labels_hba1c_s2 <- paste0(counts_hba1c_s2$Glycemic_label, ": ", counts_hba1c_s2$n)
names(legend_labels_hba1c_s2) <- counts_hba1c_s2$Glycemic_label

p_hba1c_s2 <- ggplot(df_s2, aes(x = "", y = HbA1c_value, color = Glycemic_label)) +
  geom_jitter(width = 0.12, size = 2, alpha = 0.8) +
  geom_hline(yintercept = 5.7, linetype = "dashed", color = "gray40", size = 1) +
  geom_hline(yintercept = 6.5, linetype = "dashed", color = "gray40", size = 1) +
  labs(title = "Study 2: HbA1c Classification", y = "HbA1c (%)", x = "", color = "Status") +
  theme_classic(base_size = 10) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        aspect.ratio = 1) +
  scale_color_manual(values = colors_hba1c, labels = legend_labels_hba1c_s2)

# ============================================================
# -------------------- Combine All Plots ----------------------
# ============================================================

# Row 1: HOMA-IR (Study 1 vs Study 2)
row1 <- p_homa | p_homa_s2

# Row 2: HbA1c (Study 1 vs Study 2)
row2 <- p_hba1c | p_hba1c_s2

# Full figure
full_plot <- row1 / row2

full_plot

# ============================================================
# Function to summarize HbA1c per IR group with range
# ============================================================
summarize_hba1c_IR <- function(df, study_name) {
  
  df <- df %>%
    mutate(
      HbA1c_value = as.numeric(HbA1c....),
      IR_label = ifelse(HOMA_insulin >= 2.5, "IR", "Non-IR")
    )
  
  na_count <- sum(is.na(df$HbA1c_value))
  df_clean <- df %>% filter(!is.na(HbA1c_value))
  
  descriptives <- df_clean %>%
    group_by(IR_label) %>%
    summarise(
      N = n(),
      Mean = round(mean(HbA1c_value), 2),
      SD = round(sd(HbA1c_value), 2),
      Median = round(median(HbA1c_value), 2),
      Range = paste0(round(min(HbA1c_value),2), "–", round(max(HbA1c_value),2)),
      .groups = "drop"
    ) %>%
    mutate(Study = study_name,
           N_Missing = na_count) %>%
    select(Study, IR_label, N, N_Missing, Mean, SD, Median, Range)
  
  # T-test (only if 2 groups)
  if(length(unique(df_clean$IR_label)) == 2){
    t_res <- t.test(HbA1c_value ~ IR_label, data = df_clean)
    t_test <- tibble(
      Study = study_name,
      t = round(t_res$statistic, 2),
      df = round(t_res$parameter, 1),
      p = signif(t_res$p.value, 3)
    )
  } else {
    t_test <- tibble(
      Study = study_name,
      t = NA, df = NA, p = NA
    )
  }
  
  list(descriptives = descriptives, t_test = t_test)
}

# ============================================================
# Summarize studies
# ============================================================
summary_s1 <- summarize_hba1c_IR(df_final, "Study 1")
summary_s2 <- summarize_hba1c_IR(df_ucsd_final, "Study 2")

descriptives_table <- bind_rows(summary_s1$descriptives, summary_s2$descriptives)
t_tests_table <- bind_rows(summary_s1$t_test, summary_s2$t_test)

# ============================================================
# Merge for display, only show t-test once per study
# ============================================================
final_table <- descriptives_table %>%
  left_join(t_tests_table, by = "Study") %>%
  arrange(Study, IR_label)

# Remove t-test values for second row of each study to only display once
final_table <- final_table %>%
  group_by(Study) %>%
  mutate(
    t = ifelse(row_number() == 1, t, NA),
    df = ifelse(row_number() == 1, df, NA),
    p = ifelse(row_number() == 1, p, NA)
  ) %>%
  ungroup()

# ============================================================
# Display as HTML kable
# ============================================================
kable(final_table, "html", caption = "HbA1c Summary by IR Status") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = T, font_size = 14) %>%
  row_spec(0, bold = TRUE, background = "#F5FBFF") %>%
  collapse_rows(columns = 1, valign = "top") %>%
  add_header_above(c(" " = 2, "Descriptive Statistics" = 6, "T-Test (Non-IR vs IR)" = 3))
HbA1c Summary by IR Status
Descriptive Statistics
T-Test (Non-IR vs IR)
Study IR_label N N_Missing Mean SD Median Range t df p
Study 1 IR 61 1 6.35 1.07 6.1 4.5–10.7 4.71 86.8 9.3e-06
Non-IR 76 1 5.63 0.57 5.5 4.6–7.4 NA NA NA
Study 2 IR 26 0 6.22 0.67 6.1 5.2–7.7 3.21 41.2 2.6e-03
Non-IR 30 0 5.73 0.43 5.7 4.9–6.6 NA NA NA

3 Descriptives

3.1 Clinical Visit

3.1.1 Study 1

# ============================================================
# 1. Create IR label
# ============================================================
df_final <- df_final %>%
  mutate(IR_label = ifelse(HOMA_insulin >= 2.5, "IR", "Non-IR"))

# ============================================================
# 2. Original variables to plot
# ============================================================
vars_to_plot <- c(
  "age",
  "weight_vsorres..Weight..kilograms.",
  "bmi_vsorres..BMI",
  "hip_vsorres..Hip.Circumference..cm.",
  "waist_vsorres..Waist.Circumference..cm.",
  "whr_vsorres..Waist.to.Hip.Ratio..WHR.",
  "pulse_vsorres..Heart.Rate..bpm.",
  "bp1_diabp_vsorres..Diastolic..mmHg.",
  "bp2_diabp_vsorres..Diastolic..mmHg.",
  "bp1_sysbp_vsorres..Systolic..mmHg.",
  "bp2_sysbp_vsorres..Systolic..mmHg.",
  "clinical_site"
)

# Keep only existing columns
vars_to_plot <- vars_to_plot[vars_to_plot %in% names(df_final)]

# ============================================================
# 3. Short names for titles
# ============================================================
short_names <- c(
  "bmi_vsorres..BMI"                         = "BMI",
  "bp1_diabp_vsorres..Diastolic..mmHg."     = "Diastolic_BP1",
  "bp2_diabp_vsorres..Diastolic..mmHg."     = "Diastolic_BP2",
  "bp1_sysbp_vsorres..Systolic..mmHg."     = "Systolic_BP1",
  "bp2_sysbp_vsorres..Systolic..mmHg."     = "Systolic_BP2",
  "pulse_vsorres..Heart.Rate..bpm."         = "PulseHR",
  "waist_vsorres..Waist.Circumference..cm." = "Waist_Circumference",
  "hip_vsorres..Hip.Circumference..cm."     = "Hip_Circumference",
  "weight_vsorres..Weight..kilograms."      = "Weight (kg)",
  "whr_vsorres..Waist.to.Hip.Ratio..WHR."   = "WaistHip_Ratio",
  "age"                                     = "Age",
  "clinical_site"                           = "Clinical_Site"
)

# Only keep short names for variables present
short_names <- short_names[names(short_names) %in% vars_to_plot]

# ============================================================
# 4. Colors
# ============================================================
fill_colors <- c("Non-IR" = "#A5D8FF", "IR" = "#FFB6C1")
edge_colors <- c("Non-IR" = "#1B6CA8", "IR" = "#C71585")

# ============================================================
# 5. Plotting function with p-value on the right
# ============================================================
plot_ir_variable <- function(var_name, data, short_names, fill_colors, edge_colors) {
  
  if (!var_name %in% names(data)) return(NULL)
  
  plot_data <- data %>% filter(!is.na(.data[[var_name]]))
  if (nrow(plot_data) == 0) return(NULL)
  
  plot_title <- ifelse(var_name %in% names(short_names), short_names[[var_name]], var_name)
  x <- plot_data[[var_name]]
  
  # Compute p-value
  p_val <- if (is.numeric(x)) {
    tryCatch(t.test(x ~ IR_label, data = plot_data)$p.value, error = function(e) NA)
  } else {
    tbl <- table(x, plot_data$IR_label)
    tryCatch(chisq.test(tbl)$p.value, error = function(e) NA)
  }
  
  # Format p-value as text
  p_text <- if (!is.na(p_val)) {
    if (p_val < 0.001) "p < 0.001" else paste0("p = ", signif(p_val, 3))
  } else {
    "p = NA"
  }
  
  # Numeric plot
  if (is.numeric(x)) {
    max_y <- max(x, na.rm = TRUE)
    min_y <- min(x, na.rm = TRUE)
    
    p <- ggplot(plot_data, aes(x = IR_label, y = .data[[var_name]], fill = IR_label, color = IR_label)) +
      geom_violin(alpha = 0.5, size = 0.8) +
      geom_jitter(width = 0.1, size = 1.5, alpha = 0.8) +
      scale_fill_manual(values = fill_colors) +
      scale_color_manual(values = edge_colors) +
      labs(title = plot_title, x = "IR Status", y = plot_title) +
      theme_classic(base_size = 9) +
      theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 9),    axis.text.y = element_text(face = "bold"),
            legend.position = "none",
            aspect.ratio = 1.3)
    
  } else { # Categorical plot
    p <- ggplot(plot_data, aes(x = .data[[var_name]], fill = IR_label)) +
      geom_bar(position = "dodge", color = "black") +
      scale_fill_manual(values = fill_colors) +
      labs(title = plot_title, x = plot_title, y = "Count") +
      theme_classic(base_size = 9) +
      theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 9),
            axis.text.x = element_text(angle = 45, hjust = 1),
            axis.text.y = element_text(face = "bold"),
            legend.position = "none",
            aspect.ratio = 1.3)
  }
  
  # Add p-value text annotation to the right
  # p <- p + annotate("text", x = Inf, y = Inf, label = p_text,
  #                   hjust = 1, vjust = 2.5, size = 3, fontface = "italic")
  
  return(p)
  
  # x = Inf, y = Inf, 
}

# ============================================================
# 6. Generate plots
# ============================================================
plot_list <- lapply(
  vars_to_plot,
  plot_ir_variable,
  data = df_final,
  short_names = short_names,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list <- plot_list[!sapply(plot_list, is.null)]

# ============================================================
# 7. Combine into grid (4 columns)
# ============================================================
big_panel <- wrap_plots(plot_list, ncol = 4)
big_panel

# Separate numeric and categorical variables

# ============================
# Identify numeric vs categorical variables
# ============================
num_vars <- vars_to_plot[sapply(df_final[vars_to_plot], is.numeric)]
cat_vars <- vars_to_plot[sapply(df_final[vars_to_plot], function(x) !is.numeric(x))]

# ============================
# Numeric summary
# ============================
num_summary <- lapply(num_vars, function(var) {
  data <- df_final %>% filter(!is.na(.data[[var]]))
  IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
  NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
  Total_vals <- data[[var]]
  
  # t-test
  t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
  p_raw <- if (!is.null(t_res)) t_res$p.value else NA
  
  # format p-value
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  tibble(
    Variable = ifelse(var %in% names(short_names), short_names[[var]], var),
    IR = ifelse(length(IR_vals) > 0,
                sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)), "–"),
    Non_IR = ifelse(length(NonIR_vals) > 0,
                    sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)), "–"),
    Total = ifelse(length(Total_vals) > 0,
                   sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)), "–"),
    `p-value` = p_val
  )
}) %>% bind_rows()

# ============================
# Categorical summary
# ============================
cat_summary <- lapply(cat_vars, function(var) {
  
  tbl <- table(df_final[[var]], df_final$IR_label)
  
  # Safe extraction of columns
  IR_col <- if ("IR" %in% colnames(tbl)) tbl[, "IR"] else rep(0, nrow(tbl))
  NonIR_col <- if ("Non-IR" %in% colnames(tbl)) tbl[, "Non-IR"] else rep(0, nrow(tbl))
  
  IR_total <- sum(IR_col)
  NonIR_total <- sum(NonIR_col)
  total_total <- sum(tbl)
  
  # chi-square test
  chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
  p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  # calculate percentages per column (so IR and Non-IR each sum to 100%)
  IR_str <- sprintf("%d (%.1f%%)", IR_col, 100 * IR_col / IR_total)
  NonIR_str <- sprintf("%d (%.1f%%)", NonIR_col, 100 * NonIR_col / NonIR_total)
  Total_str <- sprintf("%d (%.1f%%)", IR_col + NonIR_col, 100 * (IR_col + NonIR_col) / total_total)
  
  tibble(
    Variable = ifelse(var %in% names(short_names), short_names[[var]], var),
    IR = paste(IR_str, collapse = "; "),
    Non_IR = paste(NonIR_str, collapse = "; "),
    Total = paste(Total_str, collapse = "; "),
    `p-value` = p_val
  )
}) %>% bind_rows()

# ============================
# Combine tables
# ============================
final_table <- bind_rows(num_summary, cat_summary)

# ============================
# Add APA description row
# ============================
description_row <- tibble(
  Variable = "Note:",
  IR = "IR = mean ± SD OR count (%)",
  Non_IR = "Non-IR = mean ± SD OR count (%)",
  Total = "Total = mean ± SD OR count (%)",
  `p-value` = "t-test or χ²"
)

final_table2 <- bind_rows(description_row, final_table)

# ============================
# Add significance stars + bold
# ============================
add_stars <- function(p) {
  p_num <- suppressWarnings(as.numeric(p))
  
  # Handle "<0.001"
  if (grepl("<0.001", p)) return("**<0.001** ***")
  
  # Handle non-numeric entries (e.g., "–")
  if (is.na(p_num)) return(p)
  
  # Assign stars
  if (p_num < 0.001) return("**<0.001** ***")
  if (p_num < 0.01)   return(paste0("**", p, "**", " **"))
  if (p_num < 0.05)   return(paste0("**", p, "**", " *"))
  
  return(p)
}

final_table2$`p-value` <- sapply(final_table2$`p-value`, add_stars)

# ============================
# Identify significant rows
# ============================
p_char <- final_table$`p-value`
# Remove "<" and convert to numeric
p_numeric <- as.numeric(gsub("<", "", p_char))
sig_rows <- which(!is.na(p_numeric) & p_numeric < 0.05) + 1  # +1 for description row

# ============================
# Render APA-style table
# ============================
kbl(final_table2, "html",
    caption = "Table 1. Comparison of clinical visit variables by IR status",  escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>%   # header row
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>%  # description row
  column_spec(1, bold = TRUE) %>%
  column_spec(5, italic = TRUE) %>%
  row_spec(sig_rows, background = "#F5FBFF")
Table 1. Comparison of clinical visit variables by IR status
Variable IR Non_IR Total p-value
Note: IR = mean ± SD OR count (%) Non-IR = mean ± SD OR count (%) Total = mean ± SD OR count (%) t-test or χ²
Age 55.42 ± 9.13 56.62 ± 11.16 56.08 ± 10.28 0.489
Weight (kg) 99.21 ± 22.99 80.34 ± 25.54 88.82 ± 26.10 <0.001 ***
BMI 35.43 ± 8.08 29.31 ± 7.23 32.06 ± 8.19 <0.001 ***
Hip_Circumference 120.76 ± 16.65 109.22 ± 16.58 114.40 ± 17.52 <0.001 ***
Waist_Circumference 112.47 ± 14.91 95.57 ± 15.07 103.16 ± 17.16 <0.001 ***
WaistHip_Ratio 0.94 ± 0.08 0.88 ± 0.08 0.90 ± 0.09 <0.001 ***
PulseHR 72.65 ± 10.65 70.33 ± 11.62 71.37 ± 11.21 0.225
Diastolic_BP1 82.90 ± 9.92 78.05 ± 8.99 80.23 ± 9.69 0.003 **
Diastolic_BP2 79.82 ± 8.68 78.55 ± 9.57 79.12 ± 9.17 0.416
Systolic_BP1 133.00 ± 14.50 128.36 ± 14.42 130.44 ± 14.59 0.063
Systolic_BP2 130.85 ± 13.79 126.37 ± 15.87 128.38 ± 15.09 0.078
Clinical_Site 37 (59.7%); 25 (40.3%) 42 (55.3%); 34 (44.7%) 79 (57.2%); 59 (42.8%) 0.728

3.1.2 Study 2

# ============================================================
# 1. Create IR label
# ============================================================
df_ucsd_final <- df_ucsd_final %>%
  mutate(IR_label = ifelse(HOMA_insulin >= 2.5, "IR", "Non-IR"),
    clinical_site = factor(clinical_site))  # ensure proper categorical plotting

# ============================================================
# 2. Original variables to plot
# ============================================================
vars_to_plot <- c(
  "age",
  "weight_vsorres..Weight..kilograms.",
  "bmi_vsorres..BMI",
  "hip_vsorres..Hip.Circumference..cm.",
  "waist_vsorres..Waist.Circumference..cm.",
  "whr_vsorres..Waist.to.Hip.Ratio..WHR.",
  "pulse_vsorres..Heart.Rate..bpm.",
  "bp1_diabp_vsorres..Diastolic..mmHg.",
  "bp2_diabp_vsorres..Diastolic..mmHg.",
  "bp1_sysbp_vsorres..Systolic..mmHg.",
  "bp2_sysbp_vsorres..Systolic..mmHg.",
  "clinical_site"
)

vars_to_plot <- vars_to_plot[vars_to_plot %in% names(df_ucsd_final)]

# ============================================================
# 3. Short names for titles
# ============================================================
short_names <- c(
  "bmi_vsorres..BMI"                         = "BMI",
  "bp1_diabp_vsorres..Diastolic..mmHg."     = "Diastolic_BP1",
  "bp2_diabp_vsorres..Diastolic..mmHg."     = "Diastolic_BP2",
  "bp1_sysbp_vsorres..Systolic..mmHg."      = "Systolic_BP1",
  "bp2_sysbp_vsorres..Systolic..mmHg."      = "Systolic_BP2",
  "pulse_vsorres..Heart.Rate..bpm."         = "PulseHR",
  "waist_vsorres..Waist.Circumference..cm." = "Waist_Circumference",
  "hip_vsorres..Hip.Circumference..cm."     = "Hip_Circumference",
  "weight_vsorres..Weight..kilograms."      = "Weight (kg)",
  "whr_vsorres..Waist.to.Hip.Ratio..WHR."   = "WaistHip_Ratio",
  "age"                                     = "Age",
  "clinical_site"                           = "Clinical_Site"
)

short_names <- short_names[names(short_names) %in% vars_to_plot]

# ============================================================
# 4. Colors
# ============================================================
fill_colors <- c("Non-IR" = "#A5D8FF", "IR" = "#FFB6C1")
edge_colors <- c("Non-IR" = "#1B6CA8", "IR" = "#C71585")

# ============================================================
# 5. Plotting function with p-value on the right
# ============================================================
plot_ir_variable <- function(var_name, data, short_names, fill_colors, edge_colors) {
  
  if (!var_name %in% names(data)) return(NULL)
  
  plot_data <- data %>% filter(!is.na(.data[[var_name]]))
  if (nrow(plot_data) == 0) return(NULL)
  
  plot_title <- ifelse(var_name %in% names(short_names), short_names[[var_name]], var_name)
  x <- plot_data[[var_name]]
  
  # Compute p-value
  p_val <- if (is.numeric(x)) {
    tryCatch(t.test(x ~ IR_label, data = plot_data)$p.value, error = function(e) NA)
  } else {
    tbl <- table(x, plot_data$IR_label)
    tryCatch(chisq.test(tbl)$p.value, error = function(e) NA)
  }
  
  # Format p-value as text
  p_text <- if (!is.na(p_val)) {
    if (p_val < 0.001) "p < 0.001" else paste0("p = ", signif(p_val, 3))
  } else {
    "p = NA"
  }
  
  if (is.numeric(x)) {
    # Numeric variables: violin + jitter
    p <- ggplot(plot_data, aes(x = IR_label, y = .data[[var_name]], fill = IR_label, color = IR_label)) +
      geom_violin(alpha = 0.5, size = 0.8) +
      geom_jitter(width = 0.1, size = 1.5, alpha = 0.8) +
      scale_fill_manual(values = fill_colors) +
      scale_color_manual(values = edge_colors) +
      labs(title = plot_title, x = "IR Status", y = plot_title) +
      theme_classic(base_size = 9) +
      theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 9),
            axis.text.y = element_text(face = "bold"),
            legend.position = "none",
            aspect.ratio = 1.3)
    
  } else {
    # Ensure factor for categorical variables
    plot_data[[var_name]] <- factor(plot_data[[var_name]], levels = unique(plot_data[[var_name]]))
    
    # Generalized categorical plot
    tbl <- plot_data %>%
      group_by(.data[[var_name]], IR_label) %>%
      summarise(count = n(), .groups = "drop") %>%
      tidyr::complete(.data[[var_name]], IR_label = c("IR", "Non-IR"), fill = list(count = 0)) %>%
      group_by(IR_label) %>%
      mutate(perc = 100 * count / sum(count)) %>%
      ungroup()
    
    # Use dodge if multiple categories, single bar if only one (like clinical_site with one site)
    if (length(unique(plot_data[[var_name]])) == 1) {
      # Single bar
      p <- ggplot(tbl, aes(x = .data[[var_name]], y = perc, fill = IR_label)) +
        geom_col(color = "black") +
        scale_fill_manual(values = fill_colors) +
        labs(title = plot_title, x = "", y = "Percent") +
        theme_classic(base_size = 9) +
        theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 9),
              axis.text.x = element_text(face = "bold"),
              axis.text.y = element_text(face = "bold"),
              legend.position = "right",
              aspect.ratio = 1)
    } else {
      # Multiple categories
      p <- ggplot(tbl, aes(x = .data[[var_name]], y = perc, fill = IR_label)) +
        geom_col(position = "dodge", color = "black") +
        scale_fill_manual(values = fill_colors) +
        labs(title = plot_title, x = plot_title, y = "Percent") +
        theme_classic(base_size = 9) +
        theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 9),
              axis.text.x = element_text(angle = 45, hjust = 1),
              axis.text.y = element_text(face = "bold"),
              legend.position = "none",
              aspect.ratio = 1.3)
    }
  }
  
  return(p)
}

# ============================================================
# 6. Generate plots
# ============================================================
plot_list <- lapply(
  vars_to_plot,
  plot_ir_variable,
  data = df_ucsd_final,
  short_names = short_names,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list <- plot_list[!sapply(plot_list, is.null)]
big_panel <- wrap_plots(plot_list, ncol = 4)
big_panel

# ============================================================
# 7. Numeric and categorical summaries
# ============================================================
# ============================================================
# 7. Numeric and categorical summaries
# ============================================================

num_vars <- vars_to_plot[sapply(df_ucsd_final[vars_to_plot], is.numeric)]
cat_vars <- vars_to_plot[sapply(df_ucsd_final[vars_to_plot], function(x) !is.numeric(x))]

# -----------------------------
# Numeric summary
# -----------------------------
num_summary <- lapply(num_vars, function(var) {
  data <- df_ucsd_final %>% filter(!is.na(.data[[var]]))
  IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
  NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
  Total_vals <- data[[var]]
  
  t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
  p_raw <- if (!is.null(t_res)) t_res$p.value else NA
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  tibble(
    Variable = ifelse(var %in% names(short_names), short_names[[var]], var),
    IR = ifelse(length(IR_vals) > 0, sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)), "–"),
    Non_IR = ifelse(length(NonIR_vals) > 0, sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)), "–"),
    Total = ifelse(length(Total_vals) > 0, sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)), "–"),
    `p-value` = p_val
  )
}) %>% bind_rows()

# -----------------------------
# Categorical summary (fixed)
# -----------------------------
cat_summary <- lapply(cat_vars, function(var) {
  
  tbl <- table(df_ucsd_final[[var]], df_ucsd_final$IR_label)
  
  # Safe extraction
  IR_col <- if ("IR" %in% colnames(tbl)) tbl[, "IR"] else rep(0, nrow(tbl))
  NonIR_col <- if ("Non-IR" %in% colnames(tbl)) tbl[, "Non-IR"] else rep(0, nrow(tbl))
  total_col <- IR_col + NonIR_col
  total_total <- sum(total_col)
  
  # Chi-square test
  chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
  p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  # Percent of **total subjects** per group
  IR_str <- sprintf("%d (%.1f%%)", IR_col, 100 * IR_col / total_total)
  NonIR_str <- sprintf("%d (%.1f%%)", NonIR_col, 100 * NonIR_col / total_total)
  Total_str <- sprintf("%d (%.1f%%)", total_col, 100 * total_col / total_total)
  
  tibble(
    Variable = ifelse(var %in% names(short_names), short_names[[var]], var),
    IR = paste(IR_str, collapse = "; "),
    Non_IR = paste(NonIR_str, collapse = "; "),
    Total = paste(Total_str, collapse = "; "),
    `p-value` = p_val
  )
}) %>% bind_rows()

# -----------------------------
# Combine numeric + categorical
# -----------------------------
final_table <- bind_rows(num_summary, cat_summary)

# Description row
description_row <- tibble(
  Variable = "Note:",
  IR = "IR = mean ± SD OR count (%)",
  Non_IR = "Non-IR = mean ± SD OR count (%)",
  Total = "Total = mean ± SD OR count (%)",
  `p-value` = "t-test or χ²"
)

final_table2 <- bind_rows(description_row, final_table)

# -----------------------------
# Add significance stars
# -----------------------------
add_stars <- function(p) {
  p_num <- suppressWarnings(as.numeric(p))
  if (grepl("<0.001", p)) return("**<0.001** ***")
  if (is.na(p_num)) return(p)
  if (p_num < 0.001) return("**<0.001** ***")
  if (p_num < 0.01)   return(paste0("**", p, "** **"))
  if (p_num < 0.05)   return(paste0("**", p, "** *"))
  return(p)
}

final_table2$`p-value` <- sapply(final_table2$`p-value`, add_stars)

# -----------------------------
# Highlight significant rows
# -----------------------------
p_char <- final_table$`p-value`
p_numeric <- suppressWarnings(as.numeric(gsub("<|\\*|\\s", "", p_char)))
sig_rows <- which(!is.na(p_numeric) & p_numeric < 0.05) + 1  # +1 for description row

# -----------------------------
# Render table
# -----------------------------
kbl(final_table2, "html",
    caption = "Table 1. Comparison of variables by IR status (Study 2)",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>%
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(5, italic = TRUE) %>%
  row_spec(sig_rows, background = "#F5FBFF")
Table 1. Comparison of variables by IR status (Study 2)
Variable IR Non_IR Total p-value
Note: IR = mean ± SD OR count (%) Non-IR = mean ± SD OR count (%) Total = mean ± SD OR count (%) t-test or χ²
Age 61.27 ± 10.83 61.83 ± 9.68 61.57 ± 10.14 0.839
Weight (kg) 95.39 ± 19.94 75.18 ± 15.58 84.56 ± 20.30 <0.001 ***
BMI 34.31 ± 8.48 26.82 ± 4.65 30.30 ± 7.64 <0.001 ***
Hip_Circumference 115.88 ± 19.46 101.72 ± 11.46 108.29 ± 17.09 0.002 **
Waist_Circumference 110.62 ± 13.45 91.81 ± 14.72 100.54 ± 16.92 <0.001 ***
WaistHip_Ratio 0.96 ± 0.08 0.90 ± 0.10 0.93 ± 0.09 0.011 *
PulseHR 67.12 ± 10.84 62.73 ± 7.28 64.77 ± 9.29 0.088
Diastolic_BP1 80.35 ± 10.76 76.93 ± 13.14 78.52 ± 12.11 0.290
Diastolic_BP2 78.23 ± 12.48 75.07 ± 11.10 76.54 ± 11.76 0.324
Systolic_BP1 129.46 ± 17.74 132.03 ± 26.06 130.84 ± 22.42 0.664
Systolic_BP2 127.88 ± 19.29 131.27 ± 22.23 129.70 ± 20.80 0.545
Clinical_Site 26 (46.4%) 30 (53.6%) 56 (100.0%) 0.593

3.2 Demographics

3.2.1 Study 1

### ======================================================================
### LOAD & CLEAN DATA
### ======================================================================

### Load “other” demographic variables
other_vars <- read.csv("~/all_vars.csv") %>% 
  select(
    fh_dm2sb, years_of_education, sualcage, person_id, fh_dm2pt,
    sualckncf, suwndosfr, suwnkncf, subrkncf, subrdosfr, sulqdosfr, sulqkncf, pxhic1
  ) %>% 
  mutate(across(everything(), ~na_if(., 777)))

other_vars$participant_id <- other_vars$person_id


### Merge with df_final
df_f <- merge(other_vars, df_final, by = "participant_id")


### Load additional datasets
sleep <- read.csv("~/Downloads/sleep_participant_summary.csv")
steps <- read.csv("~/Downloads/steps_participant_summary.csv")

diet <- read.csv("~/Downloads/diet_data.csv") %>%
  mutate(across(everything(), ~na_if(., 777)))

hrv <- read.csv("~/Downloads/HRV_Joe.csv") %>%
  select(hr_from_ecg, hrv_rmssd, subject_id)

all_stress <- read.csv("~/Downloads/stress_overall_daily.csv")

survey <- read.csv("~/Downloads/survey_data.csv") %>%
  select(
    person_id,
    dmlpor..When.reflecting.on.your.typical.eating.ha,
    dmlfrveg..In.a.typical.day..how.many.servings.of.,
    dmlsugar..When.reflecting.on.your.typical.eating.,
    dmlvex..In.a.typical.week..how.often.do.you.engag,
    dmlgrain..In.a.typical.day..how.often.do.you.cons,
    dmlhex..How.often.would.you.say.that.you.engage.i,
    dmlact..What.would.you.consider.your.typical.acti
  ) %>%
  mutate(across(everything(), ~na_if(., 777)))


### Merge all datasets
df_test <- df_final %>%
  left_join(other_vars, by = "participant_id") %>%
  left_join(sleep, by = "participant_id") %>%
  left_join(steps, by = "participant_id") %>%
  left_join(diet, by = c("participant_id" = "person_id")) %>%
  left_join(hrv, by = c("participant_id" = "subject_id")) %>%
  left_join(all_stress, by = c("participant_id" = "id")) %>%
  left_join(survey, by = c("participant_id" = "person_id"))

df_test <- df_test %>% group_by(participant_id) %>% slice(1) %>% ungroup()



### ======================================================================
### RECODE VARIABLES
### ======================================================================

df_test$pxhic1_cat <- factor(
  ifelse(df_test$pxhic1 == 1, "yes",
  ifelse(df_test$pxhic1 == 0, "no", NA)),
  levels = c("no", "yes")
)

df_test$suwnkncf_cat <- factor(
  ifelse(df_test$suwnkncf == 1, "yes",
  ifelse(df_test$suwnkncf == 0, "no", NA)),
  levels = c("no", "yes")
)

df_test$sulqkncf_cat <- factor(
  ifelse(df_test$sulqkncf == 1, "yes",
  ifelse(df_test$sulqkncf == 0, "no",
  ifelse(df_test$sulqkncf == 777, "prefer not answer", NA))),
  levels = c("no", "yes", "prefer not answer")
)

df_test$fh_dm2pt_cat <- factor(
  ifelse(df_test$fh_dm2pt == 1, "yes",
  ifelse(df_test$fh_dm2pt == 0, "no",
  ifelse(df_test$fh_dm2pt == 555, "prefer not answer", NA))),
  levels = c("no", "yes", "prefer not answer")
)

df_test$fh_dm2sb_cat <- factor(
  ifelse(df_test$fh_dm2sb == 1, "yes",
  ifelse(df_test$fh_dm2sb == 0, "no",
  ifelse(df_test$fh_dm2sb == 555, "prefer not answer", NA))),
  levels = c("no", "yes", "prefer not answer")
)

df_test$fh_dm2_combined <- ifelse(
  df_test$fh_dm2pt == 555 | df_test$fh_dm2sb == 555, "prefer not answer",
  ifelse(df_test$fh_dm2pt == 1 | df_test$fh_dm2sb == 1, "yes",
  ifelse(df_test$fh_dm2pt == 0 & df_test$fh_dm2sb == 0, "no", NA))
)

df_test$fh_dm2_combined <- factor(
  df_test$fh_dm2_combined,
  levels = c("no", "yes", "prefer not answer")
)



### ======================================================================
### DEFINE short_names2 (FULL + COMPLETE)
### ======================================================================

short_names2 <- c(
  "pxhic1_cat" = "Insurance",
  "sualcage" = "Total years drinking",
  "suwndosfr" = "Wine (weekly)",
  "suwnkncf" = "Wine in past year",
  "sulqkncf" = "Liquor in past year",
  "sulqdosfr" = "Liquor (weekly)",
  "subrkncf" = "Beer/Ale in past year",
  "subrdosfr" = "Beer/Ale (weekly)",
  "fh_dm2_combined" = "Family T2D history (any)",
  "fh_dm2pt_cat" = "Parent T2D history",
  "fh_dm2sb_cat" = "Sibling T2D history",
  "years_of_education" = "Years of education"
)

### ======================================================================
### SPLIT INTO TWO VARIABLE SETS
### ======================================================================

vars_demo <- c(
  "pxhic1_cat",
  "fh_dm2_combined",
  "fh_dm2pt_cat",
  "fh_dm2sb_cat",
  "years_of_education"
)

vars_alcohol <- c(
  "sualcage",
  "suwndosfr",
  "suwnkncf",
  "sulqkncf",
  "sulqdosfr",
  "subrkncf",
  "subrdosfr"
)

short_demo    <- short_names2[names(short_names2) %in% vars_demo]
short_alcohol <- short_names2[names(short_names2) %in% vars_alcohol]

### ======================================================================
### PLOT PANELS
### (Assumes you already defined plot_ir_variable() earlier)
### ======================================================================

plot_list_demo <- lapply(
  vars_demo,
  plot_ir_variable,
  data = df_test,
  short_names = short_demo,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list_demo <- plot_list_demo[!sapply(plot_list_demo, is.null)]
big_panel_demo <- wrap_plots(plot_list_demo, ncol = 5)
big_panel_demo

### ======================================================================
### SUMMARY TABLE FUNCTION
### ======================================================================

add_stars <- function(p) {
  if (p == "–") return("–")
  pnum <- suppressWarnings(as.numeric(gsub("<", "", p)))
  if (is.na(pnum)) return(p)
  if (pnum < 0.001) return("**<0.001**")
  if (pnum < 0.01) return(paste0(p, " **"))
  if (pnum < 0.05) return(paste0(p, " *"))
  return(p)
}


create_summary_table <- function(var_list, short_names, df) {
  
  num_vars <- var_list[sapply(df[var_list], is.numeric)]
  cat_vars <- var_list[sapply(df[var_list], function(x) !is.numeric(x))]
  
  num_summary <- lapply(num_vars, function(var) {
    data <- df %>% filter(!is.na(.data[[var]]))
    IR_vals    <- data %>% filter(IR_label == "IR") %>% pull(var)
    NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
    Total_vals <- data[[var]]
    
    t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
    p_raw <- if (!is.null(t_res)) t_res$p.value else NA
    
    p_val <- if (is.na(p_raw)) "–"
      else if (p_raw < 0.001) "<0.001"
      else formatC(p_raw, digits = 3, format = "f")
    
    tibble(
      Variable = short_names[[var]],
      IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
      Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
      Total = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
      `p-value` = p_val
    )
  }) %>% bind_rows()
  
  
  cat_summary <- lapply(cat_vars, function(var) {
    tbl <- table(df[[var]], df$IR_label)
    IR_total    <- sum(tbl[, "IR"])
    NonIR_total <- sum(tbl[, "Non-IR"])
    total_total <- sum(tbl)
    
    chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
    p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
    
    p_val <- if (is.na(p_raw)) "–"
      else if (p_raw < 0.001) "<0.001"
      else formatC(p_raw, digits = 3, format = "f")
    
    IR_str    <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2], 100*x[2]/IR_total))
    NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1], 100*x[1]/NonIR_total))
    Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
    
    tibble(
      Variable = short_names[[var]],
      IR = paste(IR_str, collapse = "; "),
      Non_IR = paste(NonIR_str, collapse = "; "),
      Total = paste(Total_str, collapse = "; "),
      `p-value` = p_val
    )
  }) %>% bind_rows()
  
  
  final_table <- bind_rows(num_summary, cat_summary)
  
  missing_summary <- sapply(var_list, function(v) {
    n_miss <- sum(is.na(df[[v]]))
    pct <- round(100 * n_miss / nrow(df), 1)
    sprintf("%d (%.1f%%)", n_miss, pct)
  })
  
  final_table$`Missing (n, %)` <- missing_summary
  
  note_row <- tibble(
    Variable = "Note:",
    IR = "IR = mean ± SD or count (%)",
    Non_IR = "Non-IR = mean ± SD or count (%)",
    Total = "Total = mean ± SD or count (%)",
    `p-value` = "t-test or χ² test",
    `Missing (n, %)` = ""
  )
  
  final_table <- bind_rows(note_row, final_table)
  
  final_table$`p-value` <- sapply(final_table$`p-value`, add_stars)
  
  final_table
}

### ======================================================================
### GENERATE DEMOGRAPHIC TABLE
### ======================================================================

table_demo <- create_summary_table(vars_demo, short_demo, df_test)

# Identify significant rows
p_char <- table_demo$`p-value`[-1] # skip Note row
p_numeric <- suppressWarnings(as.numeric(gsub("[*<]", "", p_char)))
sig_rows <- which(!is.na(p_numeric) & p_numeric < 0.05) + 1  # +1 for Note row

kbl(table_demo, "html",
    caption = "Study 1. Demographic Variables by IR Status",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>%                          # Bold header
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>% # Italic Note row with background
  column_spec(1, bold = TRUE) %>%                       # Bold Variable column
  column_spec(5, italic = TRUE) %>%                     # Italic p-value column
  row_spec(sig_rows, background = "#F5FBFF")
Study 1. Demographic Variables by IR Status
Variable IR Non_IR Total p-value Missing (n, %)
Note: IR = mean ± SD or count (%) Non-IR = mean ± SD or count (%) Total = mean ± SD or count (%) t-test or χ² test
Years of education 15.13 ± 3.78 16.17 ± 3.67 15.70 ± 3.74 0.105 25 (18.1%)
Insurance 13 (24.5%); 47 (88.7%) 24 (40.0%); 29 (48.3%) 37 (32.7%); 76 (67.3%) 0.014 * 0 (0.0%)
Family T2D history (any) 30 (48.4%); 37 (59.7%); 9 (14.5%) 18 (23.7%); 28 (36.8%); 16 (21.1%) 48 (34.8%); 65 (47.1%); 25 (18.1%) 0.089 0 (0.0%)
Parent T2D history 37 (59.7%); 33 (53.2%); 6 (9.7%) 23 (30.3%); 30 (39.5%); 9 (11.8%) 60 (43.5%); 63 (45.7%); 15 (10.9%) 0.270 0 (0.0%)
Sibling T2D history 55 (88.7%); 16 (25.8%); 5 (8.1%) 35 (46.1%); 14 (18.4%); 13 (17.1%) 90 (65.2%); 30 (21.7%); 18 (13.0%) 0.034 * 0 (0.0%)

3.2.2 Study 2

### ======================================================================
### STUDY 2: LOAD & CLEAN DATA
### ======================================================================
# Load “other” demographic variables
other_vars2 <- read.csv("~/all_vars.csv") %>% 
  select(
    fh_dm2sb, years_of_education, sualcage, person_id, fh_dm2pt,
    sualckncf, suwndosfr, suwnkncf, subrkncf, subrdosfr, sulqdosfr, sulqkncf, pxhic1
  ) %>% 
  mutate(across(everything(), ~na_if(., 777)))

other_vars2$participant_id <- other_vars2$person_id

# Merge with df_ucsd_final
df_s2 <- merge(other_vars2, df_ucsd_final, by = "participant_id")

# Load additional datasets (adjust paths as needed)
sleep2 <- read.csv("~/Downloads/sleep_participant_summary.csv")
steps2 <- read.csv("~/Downloads/steps_participant_summary.csv")
diet2 <- read.csv("~/Downloads/diet_data.csv") %>%
  mutate(across(everything(), ~na_if(., 777)))
hrv2 <- read.csv("~/Downloads/HRV_Joe.csv") %>%
  select(hr_from_ecg, hrv_rmssdc_hr_merged, hrv_rmssd_outlier, ecg_snr, hrv_rmssd, subject_id)
all_stress2 <- read.csv("~/Downloads/stress_overall_daily.csv")
survey2 <- read.csv("~/Downloads/survey_data.csv") %>%
  select(
    person_id,
    dmlpor..When.reflecting.on.your.typical.eating.ha,
    dmlfrveg..In.a.typical.day..how.many.servings.of.,
    dmlsugar..When.reflecting.on.your.typical.eating.,
    dmlvex..In.a.typical.week..how.often.do.you.engag,
    dmlgrain..In.a.typical.day..how.often.do.you.cons,
    dmlhex..How.often.would.you.say.that.you.engage.i,
    dmlact..What.would.you.consider.your.typical.acti
  ) %>%
  mutate(across(everything(), ~na_if(., 777)))

# Merge all datasets
df_s2 <- df_s2 %>%
  left_join(sleep2, by = "participant_id") %>%
  left_join(steps2, by = "participant_id") %>%
  left_join(diet2, by = c("participant_id" = "person_id")) %>%
  left_join(hrv2, by = c("participant_id" = "subject_id")) %>%
  left_join(all_stress2, by = c("participant_id" = "id")) %>%
  left_join(survey2, by = c("participant_id" = "person_id"))

##prob
df_s2 = df_s2 %>% dplyr::group_by(participant_id) %>% slice(1) %>% ungroup()
#56

### ======================================================================
### RECODE VARIABLES
### ======================================================================

# Insurance
df_s2$pxhic1_cat <- factor(
  ifelse(df_s2$pxhic1 == 1, "yes",
  ifelse(df_s2$pxhic1 == 0, "no", NA)),
  levels = c("no", "yes")
)

# Alcohol variables
df_s2$suwndosfr_cat <- factor(
  ifelse(df_s2$suwndosfr == 1, "yes",
  ifelse(df_s2$suwndosfr == 0, "no", NA)),
  levels = c("no", "yes")
)

df_s2$suwnkncf_cat <- factor(
  ifelse(df_s2$suwnkncf == 1, "yes",
  ifelse(df_s2$suwnkncf == 0, "no", NA)),
  levels = c("no", "yes")
)

df_s2$sulqkncf_cat <- factor(
  ifelse(df_s2$sulqkncf == 1, "yes",
  ifelse(df_s2$sulqkncf == 0, "no",
  ifelse(df_s2$sulqkncf == 777, "prefer not answer", NA))),
  levels = c("no", "yes", "prefer not answer")
)

# Family history
df_s2$fh_dm2pt_cat <- factor(
  ifelse(df_s2$fh_dm2pt == 1, "yes",
  ifelse(df_s2$fh_dm2pt == 0, "no",
  ifelse(df_s2$fh_dm2pt == 555, "prefer not answer", NA))),
  levels = c("no", "yes", "prefer not answer")
)

df_s2$fh_dm2sb_cat <- factor(
  ifelse(df_s2$fh_dm2sb == 1, "yes",
  ifelse(df_s2$fh_dm2sb == 0, "no",
  ifelse(df_s2$fh_dm2sb == 555, "prefer not answer", NA))),
  levels = c("no", "yes", "prefer not answer")
)

df_s2$fh_dm2_combined <- ifelse(
  df_s2$fh_dm2pt == 555 | df_s2$fh_dm2sb == 555, "prefer not answer",
  ifelse(df_s2$fh_dm2pt == 1 | df_s2$fh_dm2sb == 1, "yes",
  ifelse(df_s2$fh_dm2pt == 0 & df_s2$fh_dm2sb == 0, "no", NA))
)

df_s2$fh_dm2_combined <- factor(
  df_s2$fh_dm2_combined,
  levels = c("no", "yes", "prefer not answer")
)

### ======================================================================
### DEFINE SHORT NAMES
### ======================================================================

short_names2 <- c(
  "pxhic1_cat" = "Insurance",
  "sualcage" = "Total years drinking",
  "suwndosfr" = "Wine (weekly)",
  "suwnkncf" = "Wine in past year",
  "sulqkncf" = "Liquor in past year",
  "sulqdosfr" = "Liquor (weekly)",
  "subrkncf" = "Beer/Ale in past year",
  "subrdosfr" = "Beer/Ale (weekly)",
  "fh_dm2_combined" = "Family T2D history (any)",
  "fh_dm2pt_cat" = "Parent T2D history",
  "fh_dm2sb_cat" = "Sibling T2D history",
  "years_of_education" = "Years of education"
)

### ======================================================================
### SELECT VARIABLES FOR DEMOGRAPHIC TABLE
### ======================================================================

vars_demo2 <- c(
  "pxhic1_cat",
  "fh_dm2_combined",
  "fh_dm2pt_cat",
  "fh_dm2sb_cat",
  "years_of_education"
)

short_demo2 <- short_names2[names(short_names2) %in% vars_demo2]

### ======================================================================
### CREATE SUMMARY TABLE FUNCTION (APA STYLE)
### ======================================================================

create_summary_table2 <- function(var_list, short_names, df) {
  num_vars <- var_list[sapply(df[var_list], is.numeric)]
  cat_vars <- var_list[sapply(df[var_list], function(x) !is.numeric(x))]
  
  num_summary <- lapply(num_vars, function(var) {
    data <- df %>% filter(!is.na(.data[[var]]))
    IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
    NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
    Total_vals <- data[[var]]
    
    t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
    p_raw <- if (!is.null(t_res)) t_res$p.value else NA
    p_val <- if (is.na(p_raw)) "–"
    else if (p_raw < 0.001) "<0.001"
    else formatC(p_raw, digits = 3, format = "f")
    
    tibble(
      Variable = short_names[[var]],
      IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
      Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
      Total = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
      `p-value` = p_val
    )
  }) %>% bind_rows()
  
  cat_summary <- lapply(cat_vars, function(var) {
    tbl <- table(df[[var]], df$IR_label)
    IR_total <- sum(tbl[, "IR"])
    NonIR_total <- sum(tbl[, "Non-IR"])
    total_total <- sum(tbl)
    
    chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
    p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
    p_val <- if (is.na(p_raw)) "–"
    else if (p_raw < 0.001) "<0.001"
    else formatC(p_raw, digits = 3, format = "f")
    
    # Special case for clinical_site
    if (var == "clinical_site") {
      IR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x["IR"], 100*x["IR"]/total_total))
      NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x["Non-IR"], 100*x["Non-IR"]/total_total))
      Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
    } else {
      IR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x["IR"], 100*x["IR"]/IR_total))
      NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x["Non-IR"], 100*x["Non-IR"]/NonIR_total))
      Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
    }
    
    tibble(
      Variable = short_names[[var]],
      IR = paste(IR_str, collapse = "; "),
      Non_IR = paste(NonIR_str, collapse = "; "),
      Total = paste(Total_str, collapse = "; "),
      `p-value` = p_val
    )
  }) %>% bind_rows()
  
  final_table <- bind_rows(num_summary, cat_summary)
  
  missing_summary <- sapply(var_list, function(v) {
    n_miss <- sum(is.na(df[[v]]))
    pct <- round(100*n_miss/nrow(df),1)
    sprintf("%d (%.1f%%)", n_miss, pct)
  })
  final_table$`Missing (n, %)` <- missing_summary
  
  note_row <- tibble(
    Variable = "Note:",
    IR = "IR = mean ± SD or count (%)",
    Non_IR = "Non-IR = mean ± SD or count (%)",
    Total = "Total = mean ± SD or count (%)",
    `p-value` = "t-test or χ² test",
    `Missing (n, %)` = ""
  )
  final_table <- bind_rows(note_row, final_table)
  
  final_table$`p-value` <- sapply(final_table$`p-value`, add_stars)
  final_table
}

### ======================================================================
### GENERATE STUDY 2 DEMOGRAPHIC TABLE
### ======================================================================

table_demo2 <- create_summary_table2(vars_demo2, short_demo2, df_s2)

# Identify significant rows
p_char <- table_demo2$`p-value`[-1] # skip Note row
p_numeric <- suppressWarnings(as.numeric(gsub("[*<]", "", p_char)))
sig_rows <- which(!is.na(p_numeric) & p_numeric < 0.05) + 1
# Render APA-style table
# Remove NULLs in case any plots failed
plot_list_demo2 <- lapply(
  vars_demo2,
  plot_ir_variable,
  data = df_s2,
  short_names = short_demo2,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list_demo2 <- plot_list_demo2[!sapply(plot_list_demo2, is.null)]

# Combine into a single panel
big_panel_demo2 <- wrap_plots(plot_list_demo2, ncol = 5)
big_panel_demo2

kbl(table_demo2, "html",
    caption = "Study 2. Demographic Variables by IR Status (Study 2)",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>%
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(5, italic = TRUE) %>%
  row_spec(sig_rows, background = "#F5FBFF")
Study 2. Demographic Variables by IR Status (Study 2)
Variable IR Non_IR Total p-value Missing (n, %)
Note: IR = mean ± SD or count (%) Non-IR = mean ± SD or count (%) Total = mean ± SD or count (%) t-test or χ² test
Years of education 15.54 ± 2.92 16.60 ± 2.51 16.11 ± 2.73 0.154 13 (23.2%)
Insurance 10 (47.6%); 11 (52.4%) 6 (27.3%); 16 (72.7%) 16 (37.2%); 27 (62.8%) 0.287 0 (0.0%)
Family T2D history (any) 10 (38.5%); 12 (46.2%); 4 (15.4%) 15 (50.0%); 9 (30.0%); 6 (20.0%) 25 (44.6%); 21 (37.5%); 10 (17.9%) 0.461 1 (1.8%)
Parent T2D history 12 (48.0%); 10 (40.0%); 3 (12.0%) 17 (56.7%); 9 (30.0%); 4 (13.3%) 29 (52.7%); 19 (34.5%); 7 (12.7%) 0.738 0 (0.0%)
Sibling T2D history 15 (57.7%); 9 (34.6%); 2 (7.7%) 20 (66.7%); 6 (20.0%); 4 (13.3%) 35 (62.5%); 15 (26.8%); 6 (10.7%) 0.427 0 (0.0%)

3.3 Alcohol Use

3.3.1 Study 1

### ======================================================================
### GENERATE ALCOHOL TABLE
### ======================================================================
### ======================================================================
### STUDY 2: PLOT PANELS
### ======================================================================
plot_list_alcohol <- lapply(
  vars_alcohol,
  plot_ir_variable,
  data = df_test,
  short_names = short_alcohol,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list_alcohol <- plot_list_alcohol[!sapply(plot_list_alcohol, is.null)]
big_panel_alcohol <- wrap_plots(plot_list_alcohol, ncol = 4)
big_panel_alcohol

table_alcohol <- create_summary_table(vars_alcohol, short_alcohol, df_test)


kbl(table_alcohol, "html",
    caption = "Study 1. Alcohol Variables by IR Status",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(sig_rows, background = "#F5FBFF")
Study 1. Alcohol Variables by IR Status
Variable IR Non_IR Total p-value Missing (n, %)
Note: IR = mean ± SD or count (%) Non-IR = mean ± SD or count (%) Total = mean ± SD or count (%) t-test or χ² test
Total years drinking 19.76 ± 7.22 18.70 ± 3.29 19.20 ± 5.48 0.344 31 (22.5%)
Wine (weekly) 0.90 ± 1.40 2.97 ± 4.68 2.09 ± 3.78 0.012 * 70 (50.7%)
Wine in past year 0.58 ± 0.50 0.68 ± 0.47 0.64 ± 0.48 0.270 31 (22.5%)
Liquor in past year 0.70 ± 0.46 0.60 ± 0.49 0.64 ± 0.48 0.266 31 (22.5%)
Liquor (weekly) 1.29 ± 2.64 2.18 ± 3.24 1.72 ± 2.96 0.218 70 (50.7%)
Beer/Ale in past year 0.50 ± 0.51 0.63 ± 0.49 0.57 ± 0.50 0.174 31 (22.5%)
Beer/Ale (weekly) 1.08 ± 2.55 3.14 ± 6.78 2.30 ± 5.52 0.103 77 (55.8%)

3.3.2 Study 2

vars_alcohol2 <- c(
  "sualcage",
  "suwndosfr",
  "suwnkncf",
  "sulqkncf",
  "sulqdosfr",
  "subrkncf",
  "subrdosfr"
)

short_demo2 <- short_names2[names(short_names2) %in% vars_demo2]
short_alcohol2 <- short_names2[names(short_names2) %in% vars_alcohol2]
# -------------------------------------
# Alcohol plots
# -------------------------------------
plot_list_alcohol2 <- lapply(
  vars_alcohol2,
  plot_ir_variable,
  data = df_s2,
  short_names = short_alcohol2,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list_alcohol2 <- plot_list_alcohol2[!sapply(plot_list_alcohol2, is.null)]
big_panel_alcohol2 <- wrap_plots(plot_list_alcohol2, ncol = 4)
big_panel_alcohol2

### ======================================================================
### STUDY 2 — SAFE ALCOHOL VARIABLE LIST
### ======================================================================

vars_alcohol2 <- c(
  "sualcage",
  "suwndosfr",
  "suwnkncf",
  "sulqkncf",
  "sulqdosfr",
  "subrkncf",
  "subrdosfr"
)

# Keep ONLY variables that exist in df_s2
vars_alcohol2 <- vars_alcohol2[vars_alcohol2 %in% names(df_s2)]
short_alcohol2 <- short_names2[names(short_names2) %in% vars_alcohol2]


### ======================================================================
### STUDY 2 — ALCOHOL PLOTS
### ======================================================================

plot_list_alcohol2 <- lapply(
  vars_alcohol2,
  plot_ir_variable,
  data = df_s2,
  short_names = short_alcohol2,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)


### ======================================================================
### STUDY 2 ALCOHOL TABLE  — FIXED
### ======================================================================
vars_alcohol2 <- c(
  "sualcage",
  "suwndosfr",
  "suwnkncf",
  "sulqkncf",
  "sulqdosfr",
  "subrkncf",
  "subrdosfr"
)

# Keep ONLY variables that exist in df_s2
vars_alcohol2 <- vars_alcohol2[vars_alcohol2 %in% names(df_s2)]
short_alcohol2 <- short_names2[names(short_names2) %in% vars_alcohol2]


### ======================================================================
### STUDY 2 — ALCOHOL PLOTS

### ======================================================================
### STUDY 2 ALCOHOL TABLE  — FIXED
### ======================================================================

table_alcohol2 <- create_summary_table(vars_alcohol2, short_alcohol2, df_s2)

kbl(table_alcohol2, "html",          # <-- FIXED: now using table_alcohol2
    caption = "Study 2. Alcohol Variables by IR Status",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center")  %>%
  row_spec(sig_rows, background = "#F5FBFF")
Study 2. Alcohol Variables by IR Status
Variable IR Non_IR Total p-value Missing (n, %)
Note: IR = mean ± SD or count (%) Non-IR = mean ± SD or count (%) Total = mean ± SD or count (%) t-test or χ² test
Total years drinking 19.32 ± 5.76 19.08 ± 3.98 19.20 ± 4.90 0.865 6 (10.7%)
Wine (weekly) 1.42 ± 2.15 1.12 ± 1.50 1.25 ± 1.78 0.692 28 (50.0%)
Wine in past year 0.48 ± 0.51 0.64 ± 0.49 0.56 ± 0.50 0.264 6 (10.7%)
Liquor in past year 0.60 ± 0.50 0.56 ± 0.51 0.58 ± 0.50 0.780 6 (10.7%)
Liquor (weekly) 1.80 ± 3.86 0.71 ± 1.33 1.28 ± 2.93 0.319 27 (48.2%)
Beer/Ale in past year 0.56 ± 0.51 0.68 ± 0.48 0.62 ± 0.49 0.392 6 (10.7%)
Beer/Ale (weekly) 1.71 ± 3.02 0.88 ± 1.96 1.26 ± 2.49 0.385 25 (44.6%)

3.4 Physical Activity

3.4.1 Study 1

# other_vars <- read.csv("~/all_vars.csv") %>% 
#   select(fh_dm2sb, years_of_education, sualcage, sualckncf, sualcdrv, pxhic1)  %>%
#   mutate(across(everything(), ~na_if(., 777)))
# 
# other_vars$participant_id = other_vars$person_id
# #length(other_vars$participant_id)
# 
# df_f = merge(other_vars, df_final,by= "person_id", by.y="participant_id")
# #length(df_f$participant_id) #138
# sleep <- read.csv("~/Downloads/sleep_participant_summary.csv")
# steps <- read.csv("~/Downloads/steps_participant_summary.csv")
# diet <- read.csv("~/Downloads/diet_data.csv") %>%
#   mutate(across(everything(), ~na_if(., 777)))
# hrv  <- read.csv("~/Downloads/HRV_Joe.csv") %>%
#   dplyr::select(hr_from_ecg, hrv_rmssdc_hr_merged, hrv_rmssd_outlier, ecg_snr, hrv_rmssd, subject_id)
# all_stress <- read.csv("~/Downloads/stress_overall_daily.csv")
# survey = read.csv("~/Downloads/survey_data.csv") %>%
#   # Select the columns you want
#   dplyr::select(
#     person_id,
#     dmlpor..When.reflecting.on.your.typical.eating.ha,
#     dmlfrveg..In.a.typical.day..how.many.servings.of.,
#     dmlsugar..When.reflecting.on.your.typical.eating.,
#     dmlvex..In.a.typical.week..how.often.do.you.engag,
#     dmlgrain..In.a.typical.day..how.often.do.you.cons,
#     dmlhex..How.often.would.you.say.that.you.engage.i,
#     dmlact..What.would.you.consider.your.typical.acti,
#   ) %>%
#   # Replace 777 with NA
#   dplyr::mutate(across(everything(), ~na_if(., 777)))
# 
# #cgm <- read.csv("~/Downloads/cgm_cleaned_daily.csv")
# 
# df_test <- df_final %>%
#   left_join(other_vars, by = "participant_id") %>%
#   left_join(sleep, by = "participant_id") %>%
#   left_join(steps, by = "participant_id") %>%
#   left_join(diet, by = c("participant_id" = "person_id")) %>%
#   left_join(hrv, by = c("participant_id" = "subject_id")) %>%
#   left_join(all_stress, by = c("participant_id" = "id")) %>% 
#   left_join(survey, by = c("participant_id" = "person_id"))
# 
# # Keep only one row per participant
# df_test <- df_test %>%
#   group_by(participant_id) %>%
#   slice(1) %>%   # keep the first row per participant
#   ungroup()

# Check that each participant appears only once
#table(duplicated(df_test$participant_id))  # should all be FALSE 112
Study 1. Physical Activity by IR Status
Variable IR Non_IR Total p-value Missing (n, %)
Note: IR = mean ± SD or count (%) Non-IR = mean ± SD or count (%) Total = mean ± SD or count (%) t-test or χ² test Number of missing values per variable
Vig Exercise Frequency 3.71 ± 4.04 5.66 ± 4.19 4.78 ± 4.22 0.006 ** 0 (0%)
Typical Activity Level 5.81 ± 3.75 7.43 ± 3.51 6.70 ± 3.70 0.010 * 0 (0%)
Mean Total Steps 7357.54 ± 4118.51 7549.96 ± 4239.77 7463.51 ± 4171.62 0.788 0 (0%)
SD Total Steps 6239.10 ± 2556.69 6669.63 ± 3222.56 6476.20 ± 2939.70 0.383 0 (0%)
CV Total Steps 1.00 ± 0.59 0.97 ± 0.32 0.98 ± 0.46 0.708 0 (0%)
Mean Active Minutes 182.29 ± 107.27 184.60 ± 87.17 183.56 ± 96.36 0.892 0 (0%)
Mean Activity Onset 2.91 ± 1.13 2.90 ± 1.08 2.90 ± 1.10 0.941 0 (0%)
SD Activity Onset 4.07 ± 1.16 4.04 ± 0.97 4.05 ± 1.05 0.863 0 (0%)
Pct Steps Morning 16.09 ± 7.48 16.54 ± 7.35 16.33 ± 7.38 0.723 0 (0%)
Pct Steps Afternoon 24.82 ± 9.26 22.46 ± 6.78 23.52 ± 8.05 0.097 0 (0%)
Pct Steps Evening 18.86 ± 7.03 19.02 ± 7.21 18.95 ± 7.10 0.897 0 (0%)
Activity Midpoint 11.24 ± 1.16 11.10 ± 1.12 11.16 ± 1.14 0.484 0 (0%)
Mean Steps Weekend 7500.16 ± 4628.36 6984.66 ± 4292.10 7216.26 ± 4437.28 0.503 0 (0%)
Mean Steps Weekday 7284.68 ± 4238.04 7759.95 ± 4874.57 7546.42 ± 4589.29 0.541 0 (0%)

3.4.2 Study 2

### ======================================================================
### STUDY 2 — PHYSICAL ACTIVITY PLOTS + TABLE 
### Using df_ucsd_final
### ======================================================================

# ---------------------------------------------------------
# VARIABLES TO PLOT (must match Study 1 structure exactly)
# ---------------------------------------------------------
vars_to_plot_s2 <- c(
  "dmlhex..How.often.would.you.say.that.you.engage.",
  "dmlvex..In.a.typical.week..how.often.do.you.engag",
  "dmlact..What.would.you.consider.your.typical.acti",
  "mean_total_steps",
  "sd_total_steps",
  "cv_total_steps",
  "mean_active_minutes",
  "mean_activity_onset_h",
  "sd_activity_onset_h",
  "mean_pct_steps_morning",
  "mean_pct_steps_afternoon",
  "mean_pct_steps_evening",
  "mean_activity_midpoint_h",
  "mean_steps_weekend",
  "mean_steps_weekday"
)

# Only keep variables that actually exist in df_s2
vars_to_plot_s2 <- vars_to_plot_s2[vars_to_plot_s2 %in% names(df_s2)]

# ---------------------------------------------------------
# SHORT NAMES — FILTER + ENSURE CHARACTER VECTOR
# ---------------------------------------------------------
short_names_s2 <- c(
  "dmlhex..How.often.would.you.say.that.you.engage." = "Home Exercise Freq.",
  "dmlvex..In.a.typical.week..how.often.do.you.engag" = "Vig. Exercise Freq.",
  "dmlact..What.would.you.consider.your.typical.acti" = "Typical Activity",
  "mean_total_steps" = "Mean Total Steps",
  "sd_total_steps" = "SD Total Steps",
  "cv_total_steps" = "CV Total Steps",
  "mean_active_minutes" = "Mean Active Min",
  "mean_activity_onset_h" = "Mean Activity Onset",
  "sd_activity_onset_h" = "SD Activity Onset",
  "mean_pct_steps_morning" = "Pct Steps Morning",
  "mean_pct_steps_afternoon" = "Pct Steps Afternoon",
  "mean_pct_steps_evening" = "Pct Steps Evening",
  "mean_activity_midpoint_h" = "Activity Midpoint",
  "mean_steps_weekend" = "Mean Steps Wknd",
  "mean_steps_weekday" = "Mean Steps Wkday"
)

# Filter + force character vector (fix for earlier Study 2 table error)
short_names_s2 <- unlist(short_names_s2[names(short_names_s2) %in% vars_to_plot_s2])

# ---------------------------------------------------------
# PLOTS (Study 2)
# ---------------------------------------------------------
plot_list_s2 <- lapply(
  vars_to_plot_s2,
  plot_ir_variable,
  data = df_s2,
  short_names = short_names_s2,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list_s2 <- plot_list_s2[!sapply(plot_list_s2, is.null)]
big_panel_s2 <- wrap_plots(plot_list_s2, ncol = 5)
big_panel_s2

### ======================================================================
### STUDY 2 — SUMMARY TABLE
### ======================================================================

num_vars_s2 <- vars_to_plot_s2[sapply(df_s2[vars_to_plot_s2], is.numeric)]
cat_vars_s2 <- vars_to_plot_s2[sapply(df_s2[vars_to_plot_s2], function(x) !is.numeric(x))]

# ---------------------------------------------------------
# Numeric variables summary
# ---------------------------------------------------------
num_summary_s2 <- lapply(num_vars_s2, function(var) {
  data <- df_s2 %>% filter(!is.na(.data[[var]]))
  IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
  NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
  Total_vals <- data[[var]]
  
  t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
  p_raw <- if (!is.null(t_res)) t_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  tibble(
    Variable = short_names_s2[[var]],
    IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
    Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
    Total   = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
    `p-value` = p_val
  )
}) %>% bind_rows()

# ---------------------------------------------------------
# Categorical variables summary
# ---------------------------------------------------------
cat_summary_s2 <- lapply(cat_vars_s2, function(var) {
  tbl <- table(df_s2[[var]], df_s2$IR_label)
  IR_total <- sum(tbl[, "IR"])
  NonIR_total <- sum(tbl[, "Non-IR"])
  total_total <- sum(tbl)
  
  chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
  p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  IR_str    <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2],   100*x[2]/IR_total))
  NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1],   100*x[1]/NonIR_total))
  Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
  
  tibble(
    Variable = short_names_s2[[var]],
    IR       = paste(IR_str, collapse = "; "),
    Non_IR   = paste(NonIR_str, collapse = "; "),
    Total    = paste(Total_str, collapse = "; "),
    `p-value` = p_val
  )
}) %>% bind_rows()

# ---------------------------------------------------------
# Combine tables
# ---------------------------------------------------------
final_table_s2 <- bind_rows(num_summary_s2, cat_summary_s2)

final_table_s2$`p-value` <- sapply(final_table_s2$`p-value`, add_stars)

description_row_s2 <- tibble(
  Variable = "Note:",
  IR = "IR = mean ± SD or count (%)",
  Non_IR = "Non-IR = mean ± SD or count (%)",
  Total = "Total = mean ± SD or count (%)",
  `p-value` = "t-test or χ² test"
)

final_table_s2 <- bind_rows(description_row_s2, final_table_s2)
final_table_s2$`p-value` <- sapply(final_table_s2$`p-value`, add_stars)

# ---------------------------------------------------------
# Missingness per variable
# ---------------------------------------------------------
missing_summary_s2 <- sapply(vars_to_plot_s2, function(var) {
  n_missing <- sum(is.na(df_s2[[var]]))
  pct_missing <- round(100 * n_missing / nrow(df_s2), 1)
  paste0(n_missing, " (", pct_missing, "%)")
})

missing_col_s2 <- c("Number of missing values per variable", missing_summary_s2)
final_table_s2$`Missing (n, %)` <- missing_col_s2

# ---------------------------------------------------------
# Significant rows
# ---------------------------------------------------------
p_char_s2 <- final_table_s2$`p-value`
p_numeric_s2 <- as.numeric(gsub("<", "", p_char_s2))
sig_rows_s2 <- which(!is.na(p_numeric_s2) & p_numeric_s2 < 0.05) + 1

# ---------------------------------------------------------
# Render
# ---------------------------------------------------------
kbl(final_table_s2, "html", 
    caption = "Study 2. Physical Activity Variables by IR Status",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>% 
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(5, italic = TRUE) %>%
  row_spec(sig_rows_s2, background = "#F5FBFF")
Study 2. Physical Activity Variables by IR Status
Variable IR Non_IR Total p-value Missing (n, %)
Note: IR = mean ± SD or count (%) Non-IR = mean ± SD or count (%) Total = mean ± SD or count (%) t-test or χ² test Number of missing values per variable
Vig. Exercise Freq. 5.00 ± 4.24 6.17 ± 4.49 5.62 ± 4.38 0.322 0 (0%)
Typical Activity 6.35 ± 3.89 8.17 ± 3.34 7.32 ± 3.69 0.068 0 (0%)
Mean Total Steps 5199.23 ± 4832.78 5652.44 ± 2697.07 5442.02 ± 3808.38 0.674 0 (0%)
SD Total Steps 5619.60 ± 3251.97 4783.17 ± 1750.44 5170.41 ± 2566.08 0.258 2 (3.6%)
CV Total Steps 1.24 ± 0.55 0.97 ± 0.44 1.10 ± 0.51 0.056 2 (3.6%)
Mean Active Min 135.38 ± 87.68 170.73 ± 78.37 154.32 ± 83.96 0.120 0 (0%)
Mean Activity Onset 3.14 ± 1.41 3.85 ± 1.80 3.53 ± 1.66 0.106 1 (1.8%)
SD Activity Onset 4.79 ± 1.92 4.52 ± 1.33 4.64 ± 1.62 0.556 2 (3.6%)
Pct Steps Morning 16.02 ± 10.02 19.47 ± 9.48 17.87 ± 9.80 0.193 0 (0%)
Pct Steps Afternoon 19.23 ± 11.03 23.18 ± 7.84 21.35 ± 9.57 0.135 0 (0%)
Pct Steps Evening 14.51 ± 7.04 19.77 ± 10.30 17.33 ± 9.25 0.028 * 0 (0%)
Activity Midpoint 10.94 ± 1.10 11.28 ± 1.28 11.13 ± 1.20 0.295 1 (1.8%)
Mean Steps Wknd 6190.92 ± 6284.24 6859.84 ± 4162.77 6562.54 ± 5170.32 0.656 2 (3.6%)
Mean Steps Wkday 5059.24 ± 4699.78 5228.52 ± 2367.79 5148.50 ± 3624.94 0.869 1 (1.8%)

3.5 Sleep

3.5.1 Study 1

vars_to_plot2 <- c(
  "cpd_h",
  "sd_onset_clock_h",
  "sd_midpoint_clock_h",
  "mean_wake_clock_h",
  "social_jetlag_h",
  "sri",
  "sd_wake_clock_h",
  "mean_midpoint_weekday_h",
  "mean_onset_clock_h",
  "mean_midpoint_clock_h",
  "mean_midpoint_weekend_h"
)

# Keep only those that actually exist
vars_to_plot2 <- vars_to_plot2[vars_to_plot2 %in% names(df_test)]

short_names2 <- c(
  "cpd_h" = "Sleep Chronotype",
  "sd_onset_clock_h" = "SD Sleep Onset",
  "sd_midpoint_clock_h" = "Sleep Midpoint SD",
  "mean_wake_clock_h" = "Mean Wake Time",
  "social_jetlag_h" = "Social Jetlag",
  "sri" = "Sleep Reg. Index",
  "sd_wake_clock_h" = "SD Wake Time",
  "mean_midpoint_weekday_h" = "Mean Midpoint Wkday",
  "mean_onset_clock_h" = "Sleep Onset",
  "mean_midpoint_clock_h" = "M Sleep Midpoint",
  "mean_midpoint_weekend_h" = "M Midpoint Wknd"
)

short_names2 <- short_names2[names(short_names2) %in% vars_to_plot2]
plot_list2 <- lapply(
  vars_to_plot2,
  plot_ir_variable,     
  data = df_test,
  short_names = short_names2,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list2 <- plot_list2[!sapply(plot_list2, is.null)]

big_panel2 <- wrap_plots(plot_list2, ncol = 4)
big_panel2

num_vars2 <- vars_to_plot2[sapply(df_test[vars_to_plot2], is.numeric)]
cat_vars2 <- vars_to_plot2[sapply(df_test[vars_to_plot2], function(x) !is.numeric(x))]

num_summary2 <- lapply(num_vars2, function(var) {
  data <- df_test %>% filter(!is.na(.data[[var]]))
  IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
  NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
  Total_vals <- data[[var]]
  
  t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
  p_raw <- if (!is.null(t_res)) t_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  tibble(
    Variable = short_names2[[var]],
    IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
    Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
    Total   = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
    `p-value` = p_val
  )
}) %>% bind_rows()

cat_summary2 <- lapply(cat_vars2, function(var) {
  tbl <- table(df_test[[var]], df_test$IR_label)
  IR_total <- sum(tbl[, "IR"])
  NonIR_total <- sum(tbl[, "Non-IR"])
  total_total <- sum(tbl)
  
  chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
  p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  IR_str    <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2],   100*x[2]/IR_total))
  NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1],   100*x[1]/NonIR_total))
  Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
  
  tibble(
    Variable = short_names2[[var]],
    IR       = paste(IR_str, collapse = "; "),
    Non_IR   = paste(NonIR_str, collapse = "; "),
    Total    = paste(Total_str, collapse = "; "),
    `p-value` = p_val
  )
}) %>% bind_rows()

final_tableB2 <- bind_rows(num_summary2, cat_summary2)

final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)

description_rowB <- tibble(
  Variable = "Note:",
  IR = "IR = mean ± SD or count (%)",
  Non_IR = "Non-IR = mean ± SD or count (%)",
  Total = "Total = mean ± SD or count (%)",
  `p-value` = "t-test or χ² test"
)

final_tableB2 <- bind_rows(description_rowB, final_tableB2)

final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)

p_char2 <- final_tableB2$`p-value`
p_numeric2 <- as.numeric(gsub("<", "", p_char2))
sig_rows2 <- which(!is.na(p_numeric2) & p_numeric2 < 0.05) + 1

# Compute missing counts per variable
# ----------------------------
# Compute missing counts for the variables actually plotted
missing_summary <- sapply(vars_to_plot2, function(var) {
  n_missing <- sum(is.na(df_test[[var]]))
  pct_missing <- round(100 * n_missing / nrow(df_test), 1)
  paste0(n_missing, " (", pct_missing, "%)")
})

# Add description for the first row
missing_col <- c("Number of missing values per variable", missing_summary)

# Extend final_tableB2 to have the correct length
final_tableB2$`Missing (n, %)` <- missing_col

# Render the table
kbl(final_tableB2, "html", 
    caption = "Study 1. Sleep variables by IR Status",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>% 
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(5, italic = TRUE) %>%
  row_spec(sig_rows2, background = "#F5FBFF")
Study 1. Sleep variables by IR Status
Variable IR Non_IR Total p-value Missing (n, %)
Note: IR = mean ± SD or count (%) Non-IR = mean ± SD or count (%) Total = mean ± SD or count (%) t-test or χ² test Number of missing values per variable
Sleep Chronotype 1.48 ± 1.22 1.22 ± 0.98 1.34 ± 1.10 0.173 5 (3.6%)
SD Sleep Onset 1.88 ± 1.22 1.68 ± 1.25 1.77 ± 1.24 0.348 5 (3.6%)
Sleep Midpoint SD 2.01 ± 1.58 1.63 ± 1.28 1.80 ± 1.43 0.136 5 (3.6%)
Mean Wake Time 30.16 ± 2.14 30.20 ± 2.09 30.18 ± 2.11 0.893 0 (0%)
Social Jetlag 1.48 ± 1.56 1.17 ± 1.39 1.31 ± 1.47 0.244 7 (5.1%)
Sleep Reg. Index 29.21 ± 23.98 36.35 ± 24.88 33.13 ± 24.65 0.098 7 (5.1%)
SD Wake Time 3.21 ± 3.05 2.65 ± 2.72 2.90 ± 2.87 0.278 5 (3.6%)
Mean Midpoint Wkday 27.00 ± 1.56 26.83 ± 1.22 26.90 ± 1.37 0.500 7 (5.1%)
Sleep Onset 23.64 ± 1.53 23.38 ± 1.48 23.49 ± 1.50 0.317 0 (0%)
M Sleep Midpoint 26.90 ± 1.38 26.79 ± 1.25 26.84 ± 1.31 0.643 0 (0%)
M Midpoint Wknd 26.54 ± 2.26 26.58 ± 2.05 26.56 ± 2.14 0.925 7 (5.1%)

3.5.2 Study 2

# ----------------------------
# STUDY 2 
# ----------------------------
vars_to_plot2_s2 <- c(
  "cpd_h",
  "sd_onset_clock_h",
  "sd_midpoint_clock_h",
  "mean_wake_clock_h",
  "social_jetlag_h",
  "sri",
  "sd_wake_clock_h",
  "mean_midpoint_weekday_h",
  "mean_onset_clock_h",
  "mean_midpoint_clock_h",
  "mean_midpoint_weekend_h"
)

# Keep only the variables that exist
vars_to_plot2_s2 <- vars_to_plot2_s2[vars_to_plot2_s2 %in% names(df_s2)]

short_names2_s2 <- c(
  "cpd_h" = "Sleep Chronotype",
  "sd_onset_clock_h" = "SD Sleep Onset",
  "sd_midpoint_clock_h" = "Sleep Midpoint SD",
  "mean_wake_clock_h" = "Mean Wake Time",
  "social_jetlag_h" = "Social Jetlag",
  "sri" = "Sleep Reg. Index",
  "sd_wake_clock_h" = "SD WakeTime",
  "mean_midpoint_weekday_h" = "M Midpoint Weekday",
  "mean_onset_clock_h" = "Sleep Onset",
  "mean_midpoint_clock_h" = "M Sleep Midpoint",
  "mean_midpoint_weekend_h" = "M Midpoint Weekend"
)

short_names2_s2 <- short_names2_s2[names(short_names2_s2) %in% vars_to_plot2_s2]

# ----------------------------
# PLOTS FOR STUDY 2
# ----------------------------

plot_list2_s2 <- lapply(
  vars_to_plot2_s2,
  plot_ir_variable,
  data = df_s2,
  short_names = short_names2_s2,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list2_s2 <- plot_list2_s2[!sapply(plot_list2_s2, is.null)]

big_panel2_s2 <- wrap_plots(plot_list2_s2, ncol = 4)
big_panel2_s2

# ----------------------------
# SUMMARY TABLE FOR STUDY 2
# ----------------------------

num_vars2_s2 <- vars_to_plot2_s2[sapply(df_s2[vars_to_plot2_s2], is.numeric)]
cat_vars2_s2 <- vars_to_plot2_s2[sapply(df_s2[vars_to_plot2_s2], function(x) !is.numeric(x))]

num_summary2_s2 <- lapply(num_vars2_s2, function(var) {
  data <- df_s2 %>% filter(!is.na(.data[[var]]))
  IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
  NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
  Total_vals <- data[[var]]
  
  t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
  p_raw <- if (!is.null(t_res)) t_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  tibble(
    Variable = short_names2_s2[[var]],
    IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
    Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
    Total   = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
    `p-value` = p_val
  )
}) %>% bind_rows()


cat_summary2_s2 <- lapply(cat_vars2_s2, function(var) {
  tbl <- table(df_s2[[var]], df_s2$IR_label)
  IR_total <- sum(tbl[, "IR"])
  NonIR_total <- sum(tbl[, "Non-IR"])
  total_total <- sum(tbl)
  
  chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
  p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  IR_str    <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2],   100*x[2]/IR_total))
  NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1],   100*x[1]/NonIR_total))
  Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
  
  tibble(
    Variable = short_names2_s2[[var]],
    IR       = paste(IR_str, collapse = "; "),
    Non_IR   = paste(NonIR_str, collapse = "; "),
    Total    = paste(Total_str, collapse = "; "),
    `p-value` = p_val
  )
}) %>% bind_rows()


final_tableB2_s2 <- bind_rows(num_summary2_s2, cat_summary2_s2)

final_tableB2_s2$`p-value` <- sapply(final_tableB2_s2$`p-value`, add_stars)

# Add note row
description_rowB_s2 <- tibble(
  Variable = "Note:",
  IR = "IR = mean ± SD or count (%)",
  Non_IR = "Non-IR = mean ± SD or count (%)",
  Total = "Total = mean ± SD or count (%)",
  `p-value` = "t-test or χ² test"
)

final_tableB2_s2 <- bind_rows(description_rowB_s2, final_tableB2_s2)
final_tableB2_s2$`p-value` <- sapply(final_tableB2_s2$`p-value`, add_stars)

# Identify significant rows
p_char2_s2 <- final_tableB2_s2$`p-value`
p_numeric2_s2 <- as.numeric(gsub("<", "", p_char2_s2))
sig_rows2_s2 <- which(!is.na(p_numeric2_s2) & p_numeric2_s2 < 0.05) + 1

# ----------------------------
# Missingness per variable
# ----------------------------
missing_summary_s2 <- sapply(vars_to_plot2_s2, function(var) {
  n_missing <- sum(is.na(df_s2[[var]]))
  pct_missing <- round(100 * n_missing / nrow(df_s2), 1)
  paste0(n_missing, " (", pct_missing, "%)")
})

missing_col_s2 <- c("Number of missing values per variable", missing_summary_s2)
final_tableB2_s2$`Missing (n, %)` <- missing_col_s2

# ----------------------------
# Render Table for Study 2
# ----------------------------
kbl(final_tableB2_s2, "html",
    caption = "Study 2. Sleep Variables by IR Status",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>%
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(5, italic = TRUE) %>%
  row_spec(sig_rows2_s2, background = "#F5FBFF")
Study 2. Sleep Variables by IR Status
Variable IR Non_IR Total p-value Missing (n, %)
Note: IR = mean ± SD or count (%) Non-IR = mean ± SD or count (%) Total = mean ± SD or count (%) t-test or χ² test Number of missing values per variable
Sleep Chronotype 1.09 ± 0.57 1.08 ± 1.04 1.09 ± 0.86 0.954 5 (8.9%)
SD Sleep Onset 1.66 ± 0.99 1.40 ± 0.96 1.51 ± 0.97 0.351 5 (8.9%)
Sleep Midpoint SD 1.48 ± 0.81 1.51 ± 1.41 1.49 ± 1.18 0.936 5 (8.9%)
Mean Wake Time 30.84 ± 1.19 30.43 ± 1.36 30.62 ± 1.29 0.245 2 (3.6%)
Social Jetlag 0.77 ± 0.61 1.06 ± 1.12 0.94 ± 0.94 0.238 6 (10.7%)
Sleep Reg. Index 30.87 ± 26.56 37.15 ± 21.35 34.33 ± 23.78 0.374 7 (12.5%)
SD WakeTime 1.90 ± 1.04 2.23 ± 2.86 2.09 ± 2.25 0.565 5 (8.9%)
M Midpoint Weekday 26.99 ± 1.35 27.00 ± 0.86 26.99 ± 1.08 0.980 6 (10.7%)
Sleep Onset 23.48 ± 1.85 23.35 ± 1.20 23.41 ± 1.52 0.759 2 (3.6%)
M Sleep Midpoint 27.16 ± 1.32 26.89 ± 0.73 27.01 ± 1.04 0.368 2 (3.6%)
M Midpoint Weekend 27.38 ± 1.62 26.89 ± 1.24 27.10 ± 1.42 0.256 6 (10.7%)

3.6 Diet

3.6.1 Study 1

# -----------------------------
# Step 0: Define diet variables
# -----------------------------
diet_vars <- c(
  "diet1..How.many.times.a.week.did.you.eat.fast.foo",   # Fast food
  "diet2..How.many.servings.of.fruit.did.you.eat.eac",  # Fruit
  "diet3..How.many.servings.of.vegetables.did.you.ea",  # Vegetables
  "diet4..How.many.regular.sodas.or.glasses.of.sweet",  # Soda
  "diet5..How.many.times.a.week.did.you.eat.beans..l",  # Beans/Chicken/Fish
  "diet6..How.many.times.a.week.did.you.eat.regular.",  # Salty Snacks
  "diet7..How.many.times.a.week.did.you.eat.desserts",  # Desserts
  "diet8..How.much.margarine..butter..or.meat.fat.do",  # Butter & Meat
  "diet9..How.many.servings.of.fruit.juice.did.you.d"   # Fruit Juice
)

# -----------------------------
# Step 1: Identify unhealthy items
# -----------------------------
unhealthy_vars <- c(
  "diet1..How.many.times.a.week.did.you.eat.fast.foo",  
  "diet4..How.many.regular.sodas.or.glasses.of.sweet",  
  "diet6..How.many.times.a.week.did.you.eat.regular.",  
  "diet7..How.many.times.a.week.did.you.eat.desserts",  
  "diet8..How.much.margarine..butter..or.meat.fat.do",  
  "diet9..How.many.servings.of.fruit.juice.did.you.d"
)

healthy_vars <- setdiff(diet_vars, unhealthy_vars)

# -----------------------------
# Step 2: Reverse code unhealthy items (0,1,2 → 2,1,0)
# -----------------------------
df_test[unhealthy_vars] <- 2 - df_test[unhealthy_vars]

# -----------------------------
# Step 3: Compute total Diet Score
# -----------------------------
df_test$diet_score <- rowSums(df_test[diet_vars], na.rm = TRUE)

df_test$diet_unhealthy_score <- rowSums(df_test[unhealthy_vars], na.rm = TRUE)

# Optional: scale 0–100
max_score <- length(diet_vars) * 2
df_test$diet_score_0_100 <- df_test$diet_score / max_score * 100

# -----------------------------
# Step 4: Check results
# -----------------------------
#head(df_test[, c(diet_vars, "diet_score", "diet_score_0_100")])

vars_to_plot2 <- c(
  "diet1..How.many.times.a.week.did.you.eat.fast.foo",
  "diet3..How.many.servings.of.vegetables.did.you.ea",
  "diet2..How.many.servings.of.fruit.did.you.eat.eac",
  "diet_score_0_100",
  "diet4..How.many.regular.sodas.or.glasses.of.sweet",
  "diet5..How.many.times.a.week.did.you.eat.beans..l",
  "diet7..How.many.times.a.week.did.you.eat.desserts",
  "diet8..How.much.margarine..butter..or.meat.fat.do",
  "diet9..How.many.servings.of.fruit.juice.did.you.d",
  "diet6..How.many.times.a.week.did.you.eat.regular.",
  "dmlpor..When.reflecting.on.your.typical.eating.ha",
  "dmlfrveg..In.a.typical.day..how.many.servings.of.",
  "dmlgrain..In.a.typical.day..how.often.do.you.cons",
  "dmlsugar..When.reflecting.on.your.typical.eating.")

# Keep only those that actually exist
vars_to_plot2 <- vars_to_plot2[vars_to_plot2 %in% names(df_test)]

short_names2 <- c(
  "diet_score_0_100" = "Overall Diet Score",
  "diet1..How.many.times.a.week.did.you.eat.fast.foo" = "Weekly Fastfood",
  "diet3..How.many.servings.of.vegetables.did.you.ea" = "Weekly Vegetables",
  "diet2..How.many.servings.of.fruit.did.you.eat.eac" = "Weekly Fruit",
  "diet4..How.many.regular.sodas.or.glasses.of.sweet" = "Weekly Soda",
  "diet5..How.many.times.a.week.did.you.eat.beans..l"= "Weekly Beans",
  "diet7..How.many.times.a.week.did.you.eat.desserts" =  "Weekly Desserts",
  "diet8..How.much.margarine..butter..or.meat.fat.do" =  "Weekly Butter & Meat",
  "diet9..How.many.servings.of.fruit.juice.did.you.d" =  "Weekly Fruit juice",
  "diet6..How.many.times.a.week.did.you.eat.regular." = "Regular Salty Snacks per Week",
  "dmlpor..When.reflecting.on.your.typical.eating.ha" = "Typical Portion",
  "dmlfrveg..In.a.typical.day..how.many.servings.of." = "Fruit & Veg Intake",
  "dmlgrain..In.a.typical.day..how.often.do.you.cons" = "Typical Grain Intake",
  "dmlsugar..When.reflecting.on.your.typical.eating." = "Typical Sugar Intake")

short_names2 <- short_names2[names(short_names2) %in% vars_to_plot2]
plot_list2 <- lapply(
  vars_to_plot2,
  plot_ir_variable,     # <- same function you already created
  data = df_test,
  short_names = short_names2,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list2 <- plot_list2[!sapply(plot_list2, is.null)]

big_panel2 <- wrap_plots(plot_list2, ncol = 6)
big_panel2

num_vars2 <- vars_to_plot2[sapply(df_test[vars_to_plot2], is.numeric)]
cat_vars2 <- vars_to_plot2[sapply(df_test[vars_to_plot2], function(x) !is.numeric(x))]

num_summary2 <- lapply(num_vars2, function(var) {
  data <- df_test %>% filter(!is.na(.data[[var]]))
  IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
  NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
  Total_vals <- data[[var]]
  
  t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
  p_raw <- if (!is.null(t_res)) t_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  tibble(
    Variable = short_names2[[var]],
    IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
    Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
    Total   = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
    `p-value` = p_val
  )
}) %>% bind_rows()

cat_summary2 <- lapply(cat_vars2, function(var) {
  tbl <- table(df_test[[var]], df_test$IR_label)
  IR_total <- sum(tbl[, "IR"])
  NonIR_total <- sum(tbl[, "Non-IR"])
  total_total <- sum(tbl)
  
  chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
  p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  IR_str    <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2],   100*x[2]/IR_total))
  NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1],   100*x[1]/NonIR_total))
  Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
  
  tibble(
    Variable = short_names2[[var]],
    IR       = paste(IR_str, collapse = "; "),
    Non_IR   = paste(NonIR_str, collapse = "; "),
    Total    = paste(Total_str, collapse = "; "),
    `p-value` = p_val
  )
}) %>% bind_rows()

final_tableB2 <- bind_rows(num_summary2, cat_summary2)

final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)

description_rowB <- tibble(
  Variable = "Note:",
  IR = "IR = mean ± SD or count (%)",
  Non_IR = "Non-IR = mean ± SD or count (%)",
  Total = "Total = mean ± SD or count (%)",
  `p-value` = "t-test or χ² test"
)

final_tableB2 <- bind_rows(description_rowB, final_tableB2)

final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)

p_char2  <- gsub("[^0-9\\.eE+-]", "", final_tableB2$`p-value`)
p_numeric2 <- as.numeric(gsub("<", "-", p_char2))
sig_rows2 <- which(!is.na(p_numeric2) & p_numeric2 < 0.05) + 1

# Compute missing counts per variable
# ----------------------------
# Compute missing counts for the variables actually plotted
missing_summary <- sapply(vars_to_plot2, function(var) {
  n_missing <- sum(is.na(df_test[[var]]))
  pct_missing <- round(100 * n_missing / nrow(df_test), 1)
  paste0(n_missing, " (", pct_missing, "%)")
})

# Add description for the first row
missing_col <- c("Number of missing values per variable", missing_summary)

# Extend final_tableB2 to have the correct length
final_tableB2$`Missing (n, %)` <- missing_col

# Render the table
kbl(final_tableB2, "html", 
    caption = "Study 1. Diet variables by IR Status",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>% 
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(5, italic = TRUE) %>%
  row_spec(sig_rows2, background = "#F5FBFF")
Study 1. Diet variables by IR Status
Variable IR Non_IR Total p-value Missing (n, %)
Note: IR = mean ± SD or count (%) Non-IR = mean ± SD or count (%) Total = mean ± SD or count (%) t-test or χ² test Number of missing values per variable
Weekly Fastfood 1.22 ± 0.75 1.30 ± 0.67 1.27 ± 0.70 0.551 10 (7.2%)
Weekly Vegetables 1.47 ± 0.71 1.40 ± 0.65 1.43 ± 0.67 0.588 10 (7.2%)
Weekly Fruit 1.74 ± 0.52 1.57 ± 0.61 1.64 ± 0.57 0.089 12 (8.7%)
Overall Diet Score 66.04 ± 21.10 60.82 ± 22.44 63.16 ± 21.93 0.162 0 (0%)
Weekly Soda 1.62 ± 0.64 1.66 ± 0.63 1.64 ± 0.64 0.749 10 (7.2%)
Weekly Beans 0.66 ± 0.74 0.51 ± 0.65 0.58 ± 0.69 0.260 10 (7.2%)
Weekly Desserts 1.45 ± 0.65 1.21 ± 0.72 1.32 ± 0.70 0.056 10 (7.2%)
Weekly Butter & Meat 1.21 ± 0.59 1.26 ± 0.61 1.23 ± 0.60 0.635 10 (7.2%)
Weekly Fruit juice 1.83 ± 0.46 1.75 ± 0.47 1.79 ± 0.47 0.374 11 (8%)
Regular Salty Snacks per Week 1.55 ± 0.63 1.27 ± 0.70 1.40 ± 0.68 0.018 * 10 (7.2%)
Typical Portion 6.67 ± 3.01 6.64 ± 2.87 6.65 ± 2.92 0.966 2 (1.4%)
Fruit & Veg Intake 5.40 ± 3.04 5.72 ± 3.02 5.58 ± 3.02 0.538 0 (0%)
Typical Grain Intake 5.65 ± 4.20 5.53 ± 3.88 5.58 ± 4.01 0.864 0 (0%)
Typical Sugar Intake 4.44 ± 3.15 4.41 ± 3.36 4.42 ± 3.26 0.960 0 (0%)

3.6.2 Study 2

# -----------------------------
# Step 0: Define diet variables
# -----------------------------
diet_vars <- c(
  "diet1..How.many.times.a.week.did.you.eat.fast.foo",   # Fast food
  "diet2..How.many.servings.of.fruit.did.you.eat.eac",  # Fruit
  "diet3..How.many.servings.of.vegetables.did.you.ea",  # Vegetables
  "diet4..How.many.regular.sodas.or.glasses.of.sweet",  # Soda
  "diet5..How.many.times.a.week.did.you.eat.beans..l",  # Beans/Chicken/Fish
  "diet6..How.many.times.a.week.did.you.eat.regular.",  # Salty Snacks
  "diet7..How.many.times.a.week.did.you.eat.desserts",  # Desserts
  "diet8..How.much.margarine..butter..or.meat.fat.do",  # Butter & Meat
  "diet9..How.many.servings.of.fruit.juice.did.you.d")   # Fruit Juice )

# -----------------------------
# Step 1: Identify unhealthy items
# -----------------------------
unhealthy_vars <- c(
  "diet1..How.many.times.a.week.did.you.eat.fast.foo",  
  "diet4..How.many.regular.sodas.or.glasses.of.sweet",  
  "diet6..How.many.times.a.week.did.you.eat.regular.",  
  "diet7..How.many.times.a.week.did.you.eat.desserts",  
  "diet8..How.much.margarine..butter..or.meat.fat.do",  
  "diet9..How.many.servings.of.fruit.juice.did.you.d")

healthy_vars <- setdiff(diet_vars, unhealthy_vars)

# -----------------------------
# Step 2: Reverse code unhealthy items (0,1,2 → 2,1,0)
# -----------------------------
df_s2[unhealthy_vars] <- 2 - df_s2[unhealthy_vars]

# -----------------------------
# Step 3: Compute total Diet Score
# -----------------------------
df_s2$diet_score <- rowSums(df_s2[diet_vars], na.rm = TRUE)
df_s2$diet_unhealthy_score <- rowSums(df_s2[unhealthy_vars], na.rm = TRUE)

# Optional: scale 0–100
max_score <- length(diet_vars) * 2
df_s2$diet_score_0_100 <- df_s2$diet_score / max_score * 100

# -----------------------------
# Step 4: Variables to plot
# -----------------------------
vars_to_plot2 <- c(
  "diet1..How.many.times.a.week.did.you.eat.fast.foo",
  "diet3..How.many.servings.of.vegetables.did.you.ea",
  "diet2..How.many.servings.of.fruit.did.you.eat.eac",
  "diet_score_0_100",
  "diet4..How.many.regular.sodas.or.glasses.of.sweet",
  "diet5..How.many.times.a.week.did.you.eat.beans..l",
  "diet7..How.many.times.a.week.did.you.eat.desserts",
  "diet8..How.much.margarine..butter..or.meat.fat.do",
  "diet9..How.many.servings.of.fruit.juice.did.you.d",
  "diet6..How.many.times.a.week.did.you.eat.regular.",
  "dmlpor..When.reflecting.on.your.typical.eating.ha",
  "dmlfrveg..In.a.typical.day..how.many.servings.of.",
  "dmlgrain..In.a.typical.day..how.often.do.you.cons",
  "dmlsugar..When.reflecting.on.your.typical.eating."
)

# Keep only variables that exist
vars_to_plot2 <- vars_to_plot2[vars_to_plot2 %in% names(df_s2)]

short_names2 <- c(
  "diet_score_0_100" = "Overall Diet Score",
  "diet1..How.many.times.a.week.did.you.eat.fast.foo" = "Weekly Fastfood",
  "diet3..How.many.servings.of.vegetables.did.you.ea" = "Weekly Vegetables",
  "diet2..How.many.servings.of.fruit.did.you.eat.eac" = "Weekly Fruit",
  "diet4..How.many.regular.sodas.or.glasses.of.sweet" = "Weekly Soda",
  "diet5..How.many.times.a.week.did.you.eat.beans..l"= "Weekly Beans",
  "diet7..How.many.times.a.week.did.you.eat.desserts" =  "Weekly Desserts",
  "diet8..How.much.margarine..butter..or.meat.fat.do" =  "Weekly Butter & Meat",
  "diet9..How.many.servings.of.fruit.juice.did.you.d" =  "Weekly Fruit juice",
  "diet6..How.many.times.a.week.did.you.eat.regular." = "Regular Salty Snacks per Week",
  "dmlpor..When.reflecting.on.your.typical.eating.ha" = "Typical Portion",
  "dmlfrveg..In.a.typical.day..how.many.servings.of." = "Fruit & Veg Intake",
  "dmlgrain..In.a.typical.day..how.often.do.you.cons" = "Typical Grain Intake",
  "dmlsugar..When.reflecting.on.your.typical.eating." = "Typical Sugar Intake"
)

short_names2 <- short_names2[names(short_names2) %in% vars_to_plot2]

# -----------------------------
# Step 5: Generate plots
# -----------------------------
plot_list2 <- lapply(
  vars_to_plot2,
  plot_ir_variable,
  data = df_s2,
  short_names = short_names2,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list2 <- plot_list2[!sapply(plot_list2, is.null)]
big_panel2 <- wrap_plots(plot_list2, ncol = 6)
big_panel2

# -----------------------------
# Step 6: Summarize numeric vs categorical
# -----------------------------
num_vars2 <- vars_to_plot2[sapply(df_s2[vars_to_plot2], is.numeric)]
cat_vars2 <- vars_to_plot2[sapply(df_s2[vars_to_plot2], function(x) !is.numeric(x))]

# Numeric summaries
num_summary2 <- lapply(num_vars2, function(var) {
  data <- df_s2 %>% filter(!is.na(.data[[var]]))
  IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
  NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
  Total_vals <- data[[var]]
  
  t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
  p_raw <- if (!is.null(t_res)) t_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  tibble(
    Variable = short_names2[[var]],
    IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
    Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
    Total   = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
    `p-value` = p_val
  )
}) %>% bind_rows()

# Categorical summaries
cat_summary2 <- lapply(cat_vars2, function(var) {
  tbl <- table(df_s2[[var]], df_s2$IR_label)
  IR_total <- sum(tbl[, "IR"])
  NonIR_total <- sum(tbl[, "Non-IR"])
  total_total <- sum(tbl)
  
  chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
  p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  IR_str    <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2],   100*x[2]/IR_total))
  NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1],   100*x[1]/NonIR_total))
  Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
  
  tibble(
    Variable = short_names2[[var]],
    IR       = paste(IR_str, collapse = "; "),
    Non_IR   = paste(NonIR_str, collapse = "; "),
    Total    = paste(Total_str, collapse = "; "),
    `p-value` = p_val
  )
}) %>% bind_rows()

final_tableB2 <- bind_rows(num_summary2, cat_summary2)

final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)

# Add description row
description_rowB <- tibble(
  Variable = "Note:",
  IR = "IR = mean ± SD or count (%)",
  Non_IR = "Non-IR = mean ± SD or count (%)",
  Total = "Total = mean ± SD or count (%)",
  `p-value` = "t-test or χ² test"
)

final_tableB2 <- bind_rows(description_rowB, final_tableB2)
final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)

# Identify significant rows
p_char2  <- gsub("[^0-9\\.eE+-]", "", final_tableB2$`p-value`)
p_numeric2 <- as.numeric(gsub("<", "-", p_char2))
sig_rows2 <- which(!is.na(p_numeric2) & p_numeric2 < 0.05) + 1

# Compute missing counts per variable
missing_summary <- sapply(vars_to_plot2, function(var) {
  n_missing <- sum(is.na(df_s2[[var]]))
  pct_missing <- round(100 * n_missing / nrow(df_s2), 1)
  paste0(n_missing, " (", pct_missing, "%)")
})

missing_col <- c("Number of missing values per variable", missing_summary)
final_tableB2$`Missing (n, %)` <- missing_col

# Render the table
kbl(final_tableB2, "html", 
    caption = "Study 2. Diet Variables by IR Status",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>% 
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(5, italic = TRUE) %>%
  row_spec(sig_rows2, background = "#F5FBFF")
Study 2. Diet Variables by IR Status
Variable IR Non_IR Total p-value Missing (n, %)
Note: IR = mean ± SD or count (%) Non-IR = mean ± SD or count (%) Total = mean ± SD or count (%) t-test or χ² test Number of missing values per variable
Weekly Fastfood 1.44 ± 0.51 1.60 ± 0.58 1.52 ± 0.54 0.303 6 (10.7%)
Weekly Vegetables 1.60 ± 0.50 1.40 ± 0.58 1.50 ± 0.54 0.197 6 (10.7%)
Weekly Fruit 1.56 ± 0.58 1.75 ± 0.53 1.65 ± 0.56 0.239 7 (12.5%)
Overall Diet Score 66.67 ± 17.43 62.59 ± 29.68 64.48 ± 24.63 0.528 0 (0%)
Weekly Soda 1.68 ± 0.56 1.88 ± 0.33 1.78 ± 0.46 0.131 6 (10.7%)
Weekly Beans 0.68 ± 0.69 0.40 ± 0.58 0.54 ± 0.65 0.127 6 (10.7%)
Weekly Desserts 1.20 ± 0.76 1.52 ± 0.65 1.36 ± 0.72 0.118 6 (10.7%)
Weekly Butter & Meat 1.32 ± 0.48 1.48 ± 0.59 1.40 ± 0.53 0.295 6 (10.7%)
Weekly Fruit juice 1.52 ± 0.65 1.88 ± 0.33 1.70 ± 0.54 0.019 * 6 (10.7%)
Regular Salty Snacks per Week 1.48 ± 0.65 1.68 ± 0.48 1.58 ± 0.57 0.223 6 (10.7%)
Typical Portion 6.15 ± 2.94 7.33 ± 3.14 6.79 ± 3.08 0.153 0 (0%)
Fruit & Veg Intake 5.77 ± 2.32 5.52 ± 2.79 5.64 ± 2.56 0.716 1 (1.8%)
Typical Grain Intake 7.31 ± 3.80 7.17 ± 3.13 7.23 ± 3.43 0.881 0 (0%)
Typical Sugar Intake 3.65 ± 3.02 5.17 ± 3.07 4.46 ± 3.12 0.069 0 (0%)

3.7 HR/HRV

3.7.1 Study 1

# Keep only those that actually exist
vars_to_plot2 <- vars_to_plot2[vars_to_plot2 %in% names(df_test)]

df_test$hrv_rmssd_clean <- pmin(
  pmax(df_test$hrv_rmssd, 5),  # lower bound
  100                          # upper bound
)

num_changed_rmssd <- sum(df_test$hrv_rmssd_clean != df_test$hrv_rmssd, na.rm = TRUE)

vars_to_plot2 <- c(
  "hr_from_ecg",
  "hrv_rmssd_clean",
  "stress_overall_daily_mean_stress")
short_names2 <- c(
  "hr_from_ecg" = "Resting HR (ECG)",
  "hrv_rmssd_clean" = "HRV RMSSD (ECG)",
  "stress_overall_daily_mean_stress" = "Daily Stress score (Garmin)")

short_names2 <- short_names2[names(short_names2) %in% vars_to_plot2]
plot_list2 <- lapply(
  vars_to_plot2,
  plot_ir_variable,     # <- same function you already created
  data = df_test,
  short_names = short_names2,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list2 <- plot_list2[!sapply(plot_list2, is.null)]

big_panel2 <- wrap_plots(plot_list2, ncol = 4)
big_panel2

num_vars2 <- vars_to_plot2[sapply(df_test[vars_to_plot2], is.numeric)]
cat_vars2 <- vars_to_plot2[sapply(df_test[vars_to_plot2], function(x) !is.numeric(x))]

num_summary2 <- lapply(num_vars2, function(var) {
  data <- df_test %>% filter(!is.na(.data[[var]]))
  IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
  NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
  Total_vals <- data[[var]]
  
  t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
  p_raw <- if (!is.null(t_res)) t_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  tibble(
    Variable = short_names2[[var]],
    IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
    Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
    Total   = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
    `p-value` = p_val
  )
}) %>% bind_rows()

cat_summary2 <- lapply(cat_vars2, function(var) {
  tbl <- table(df_test[[var]], df_test$IR_label)
  IR_total <- sum(tbl[, "IR"])
  NonIR_total <- sum(tbl[, "Non-IR"])
  total_total <- sum(tbl)
  
  chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
  p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  IR_str    <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2],   100*x[2]/IR_total))
  NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1],   100*x[1]/NonIR_total))
  Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
  
  tibble(
    Variable = short_names2[[var]],
    IR       = paste(IR_str, collapse = "; "),
    Non_IR   = paste(NonIR_str, collapse = "; "),
    Total    = paste(Total_str, collapse = "; "),
    `p-value` = p_val
  )
}) %>% bind_rows()

final_tableB2 <- bind_rows(num_summary2, cat_summary2)

final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)

description_rowB <- tibble(
  Variable = "Note:",
  IR = "IR = mean ± SD or count (%)",
  Non_IR = "Non-IR = mean ± SD or count (%)",
  Total = "Total = mean ± SD or count (%)",
  `p-value` = "t-test or χ² test"
)

final_tableB2 <- bind_rows(description_rowB, final_tableB2)

final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)

p_char2  <- gsub("[^0-9\\.eE+-]", "", final_tableB2$`p-value`)
p_numeric2 <- as.numeric(gsub("<", "-", p_char2))
sig_rows2 <- which(!is.na(p_numeric2) & p_numeric2 < 0.05) + 1

# Compute missing counts per variable
# ----------------------------
# Compute missing counts for the variables actually plotted
missing_summary <- sapply(vars_to_plot2, function(var) {
  n_missing <- sum(is.na(df_test[[var]]))
  pct_missing <- round(100 * n_missing / nrow(df_test), 1)
  paste0(n_missing, " (", pct_missing, "%)")
})

# Add description for the first row
missing_col <- c("Number of missing values per variable", missing_summary)

# Extend final_tableB2 to have the correct length
final_tableB2$`Missing (n, %)` <- missing_col

# Render the table
kbl(final_tableB2, "html", 
    caption = "Study 1. HR/HRV by IR Status",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>% 
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(5, italic = TRUE) %>%
  row_spec(sig_rows2, background = "#F5FBFF")
Study 1. HR/HRV by IR Status
Variable IR Non_IR Total p-value Missing (n, %)
Note: IR = mean ± SD or count (%) Non-IR = mean ± SD or count (%) Total = mean ± SD or count (%) t-test or χ² test Number of missing values per variable
Resting HR (ECG) 69.03 ± 11.47 64.99 ± 11.84 66.83 ± 11.80 0.049 * 6 (4.3%)
HRV RMSSD (ECG) 32.56 ± 26.10 35.84 ± 26.35 34.35 ± 26.19 0.476 6 (4.3%)
Daily Stress score (Garmin) 27.76 ± 13.97 24.34 ± 13.48 25.88 ± 13.76 0.149 0 (0%)

3.8 Study 2

# Keep only those that actually exist
vars_to_plot2 <- vars_to_plot2[vars_to_plot2 %in% names(df_s2)]

df_s2$hrv_rmssd_clean <- pmin(
  pmax(df_s2$hrv_rmssd, 5),  # lower bound
  100                         # upper bound
)

num_changed_rmssd <- sum(df_s2$hrv_rmssd_clean != df_s2$hrv_rmssd, na.rm = TRUE)

vars_to_plot2 <- c(
  "hr_from_ecg",
  "hrv_rmssd_clean",
  "stress_overall_daily_mean_stress"
)

short_names2 <- c(
  "hr_from_ecg" = "Resting HR (ECG)",
  "hrv_rmssd_clean" = "HRV RMSSD (ECG)",
  "stress_overall_daily_mean_stress" = "Daily Stress score (Garmin)"
)

short_names2 <- short_names2[names(short_names2) %in% vars_to_plot2]
plot_list2 <- lapply(
  vars_to_plot2,
  plot_ir_variable,     # <- same function you already created
  data = df_s2,
  short_names = short_names2,
  fill_colors = fill_colors,
  edge_colors = edge_colors
)

plot_list2 <- plot_list2[!sapply(plot_list2, is.null)]
big_panel2 <- wrap_plots(plot_list2, ncol = 4)
big_panel2

num_vars2 <- vars_to_plot2[sapply(df_s2[vars_to_plot2], is.numeric)]
cat_vars2 <- vars_to_plot2[sapply(df_s2[vars_to_plot2], function(x) !is.numeric(x))]

num_summary2 <- lapply(num_vars2, function(var) {
  data <- df_s2 %>% filter(!is.na(.data[[var]]))
  IR_vals <- data %>% filter(IR_label == "IR") %>% pull(var)
  NonIR_vals <- data %>% filter(IR_label == "Non-IR") %>% pull(var)
  Total_vals <- data[[var]]
  
  t_res <- tryCatch(t.test(IR_vals, NonIR_vals), error = function(e) NULL)
  p_raw <- if (!is.null(t_res)) t_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  tibble(
    Variable = short_names2[[var]],
    IR = sprintf("%.2f ± %.2f", mean(IR_vals), sd(IR_vals)),
    Non_IR = sprintf("%.2f ± %.2f", mean(NonIR_vals), sd(NonIR_vals)),
    Total   = sprintf("%.2f ± %.2f", mean(Total_vals), sd(Total_vals)),
    `p-value` = p_val
  )
}) %>% bind_rows()

cat_summary2 <- lapply(cat_vars2, function(var) {
  tbl <- table(df_s2[[var]], df_s2$IR_label)
  IR_total <- sum(tbl[, "IR"])
  NonIR_total <- sum(tbl[, "Non-IR"])
  total_total <- sum(tbl)
  
  chisq_res <- tryCatch(chisq.test(tbl), error = function(e) NULL)
  p_raw <- if (!is.null(chisq_res)) chisq_res$p.value else NA
  
  p_val <- if (!is.na(p_raw)) {
    if (p_raw < 0.001) "<0.001" else formatC(p_raw, digits = 3, format = "f")
  } else "–"
  
  IR_str    <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[2],   100*x[2]/IR_total))
  NonIR_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", x[1],   100*x[1]/NonIR_total))
  Total_str <- apply(tbl, 1, function(x) sprintf("%d (%.1f%%)", sum(x), 100*sum(x)/total_total))
  
  tibble(
    Variable = short_names2[[var]],
    IR       = paste(IR_str, collapse = "; "),
    Non_IR   = paste(NonIR_str, collapse = "; "),
    Total    = paste(Total_str, collapse = "; "),
    `p-value` = p_val
  )
}) %>% bind_rows()

final_tableB2 <- bind_rows(num_summary2, cat_summary2)

final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)

description_rowB <- tibble(
  Variable = "Note:",
  IR = "IR = mean ± SD or count (%)",
  Non_IR = "Non-IR = mean ± SD or count (%)",
  Total = "Total = mean ± SD or count (%)",
  `p-value` = "t-test or χ² test"
)

final_tableB2 <- bind_rows(description_rowB, final_tableB2)
final_tableB2$`p-value` <- sapply(final_tableB2$`p-value`, add_stars)

p_char2  <- gsub("[^0-9\\.eE+-]", "", final_tableB2$`p-value`)
p_numeric2 <- as.numeric(gsub("<", "-", p_char2))
sig_rows2 <- which(!is.na(p_numeric2) & p_numeric2 < 0.05) + 1

# Compute missing counts per variable
missing_summary <- sapply(vars_to_plot2, function(var) {
  n_missing <- sum(is.na(df_s2[[var]]))
  pct_missing <- round(100 * n_missing / nrow(df_s2), 1)
  paste0(n_missing, " (", pct_missing, "%)")
})

missing_col <- c("Number of missing values per variable", missing_summary)
final_tableB2$`Missing (n, %)` <- missing_col

# Render the table
kbl(final_tableB2, "html", 
    caption = "Study 2. HR/HRV by IR Status",
    escape = FALSE) %>%
  kable_styling(full_width = TRUE, position = "center") %>%
  row_spec(0, bold = TRUE) %>% 
  row_spec(1, italic = TRUE, background = "#F0F0F0") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(5, italic = TRUE) %>%
  row_spec(sig_rows2, background = "#F5FBFF")
Study 2. HR/HRV by IR Status
Variable IR Non_IR Total p-value Missing (n, %)
Note: IR = mean ± SD or count (%) Non-IR = mean ± SD or count (%) Total = mean ± SD or count (%) t-test or χ² test Number of missing values per variable
Resting HR (ECG) 64.38 ± 11.94 56.83 ± 6.07 60.40 ± 9.98 0.006 ** 1 (1.8%)
HRV RMSSD (ECG) 33.94 ± 31.56 30.25 ± 23.12 32.00 ± 27.24 0.626 1 (1.8%)
Daily Stress score (Garmin) 19.44 ± 13.86 22.33 ± 11.69 20.96 ± 12.73 0.410 1 (1.8%)

4 Results

5 Model development

5.0.1 Features

library(tidyverse)
library(knitr)
library(kableExtra)

# -------------------------------
# 1. Define subset of features
# -------------------------------

X <- df_test %>%
  dplyr::select(
    fh_dm2sb_cat,
    years_of_education, 
    age, 
    bmi_vsorres..BMI, 
    whr_vsorres..Waist.to.Hip.Ratio..WHR.,
    pulse_vsorres..Heart.Rate..bpm.,
    sualcage, 
    sualckncf,
    sri,
    mean_pct_steps_morning,
    mean_pct_steps_evening,
    dmlhex..How.often.would.you.say.that.you.engage.i,
    dmlvex..In.a.typical.week..how.often.do.you.engag,
    hr_from_ecg,
    mean_active_minutes,
    diet6..How.many.times.a.week.did.you.eat.regular.,
    diet1..How.many.times.a.week.did.you.eat.fast.foo,
    dmlgrain..In.a.typical.day..how.often.do.you.cons,
    dmlsugar..When.reflecting.on.your.typical.eating.,
    cv_total_steps,
    stress_overall_daily_mean_stress
  ) %>%
  mutate(across(where(is.factor), ~ factor(trimws(.))))

selected_features <- colnames(X)

# -------------------------------
# 2. Feature categories
# -------------------------------

feature_categories <- tibble(
  Feature = selected_features,
  Category = c(
    "Demographics",           # fh_dm2sb_cat
    "Demographics",           # years_of_education
    "Demographics",           # age
    "Anthropometrics & Vitals", # bmi_vsorres..BMI
    "Anthropometrics & Vitals", # whr_vsorres..Waist.to.Hip.Ratio..WHR.
    "Anthropometrics & Vitals", # pulse_vsorres..Heart.Rate..bpm.
    "Alcohol / Substance Use",  # sualcage
    "Alcohol / Substance Use",  # sualckncf
    "Sleep",                    # sri
    "Physical Activity",        # mean_pct_steps_morning
    "Physical Activity",        # mean_pct_steps_evening
    "Physical Activity",        # dmlhex..How.often.would.you.say.that.you.engage.i
    "Physical Activity",        # dmlvex..In.a.typical.week..how.often.do.you.engag
    "ECG / Heart Rate & HRV",   # hr_from_ecg
    "Physical Activity",        # mean_active_minutes
    "Nutrition",                # diet6..How.many.times.a.week.did.you.eat.regular.
    "Nutrition",                # diet1..How.many.times.a.week.did.you.eat.fast.foo
    "Nutrition",                # dmlgrain..In.a.typical.day..how.often.do.you.cons
    "Nutrition",                # dmlsugar..When.reflecting.on.your.typical.eating.
    "Physical Activity",        # cv_total_steps
    "Stress"                    # stress_overall_daily_mean_stress
  )
)

# -------------------------------
# 3. Short names dictionary
# -------------------------------

short_names_all <- c(
  "bmi_vsorres..BMI" = "BMI",
  "whr_vsorres..Waist.to.Hip.Ratio..WHR." = "WaistHip_Ratio",
  "pulse_vsorres..Heart.Rate..bpm." = "PulseHR",
  "age" = "Age",
  "fh_dm2sb_cat" = "Sibling T2D history",
  "years_of_education" = "Years of education",
  "sualcage" = "Total years drinking",
  "sualckncf" = "Wine in past year",
  "sri" = "Sleep Reg. Index",
  "mean_pct_steps_morning" = "Pct Steps Morning",
  "mean_pct_steps_evening" = "Pct Steps Evening",
  "dmlhex..How.often.would.you.say.that.you.engage.i" = "Home Exercise Frequency",
  "dmlvex..In.a.typical.week..how.often.do.you.engag" = "Vig Exercise Frequency",
  "hr_from_ecg" = "Resting HR (ECG)",
  "mean_active_minutes" = "Mean Active Minutes",
  "diet6..How.many.times.a.week.did.you.eat.regular." = "Regular Salty Snacks per Week",
  "diet1..How.many.times.a.week.did.you.eat.fast.foo" = "Weekly Fastfood",
  "dmlgrain..In.a.typical.day..how.often.do.you.cons" = "Typical Grain Intake",
  "dmlsugar..When.reflecting.on.your.typical.eating." = "Typical Sugar Intake",
  "cv_total_steps" = "CV Total Steps",
  "stress_overall_daily_mean_stress" = "Daily Stress score (Garmin)"
)

short_names_tbl <- tibble(
  Feature = names(short_names_all),
  Short_Name = unname(short_names_all)
)

# -------------------------------
# 4. Combine category + name (2 columns)
# -------------------------------

final_table <- feature_categories %>%
  left_join(short_names_tbl, by = "Feature") %>%
  mutate(`Feature / Short Name` = coalesce(Short_Name, Feature)) %>%
  select(Category, `Feature / Short Name`) %>%
  arrange(Category, `Feature / Short Name`)

# -------------------------------
# 5. Render 2-column table
# -------------------------------

kable(
  final_table,
  caption = "Selected Features by Category",
  col.names = c("Category", "Feature / Short Name")
) %>%
  kable_classic(full_width = TRUE)
Selected Features by Category
Category Feature / Short Name
Alcohol / Substance Use Total years drinking
Alcohol / Substance Use Wine in past year
Anthropometrics & Vitals BMI
Anthropometrics & Vitals PulseHR
Anthropometrics & Vitals WaistHip_Ratio
Demographics Age
Demographics Sibling T2D history
Demographics Years of education
ECG / Heart Rate & HRV Resting HR (ECG)
Nutrition Regular Salty Snacks per Week
Nutrition Typical Grain Intake
Nutrition Typical Sugar Intake
Nutrition Weekly Fastfood
Physical Activity CV Total Steps
Physical Activity Home Exercise Frequency
Physical Activity Mean Active Minutes
Physical Activity Pct Steps Evening
Physical Activity Pct Steps Morning
Physical Activity Vig Exercise Frequency
Sleep Sleep Reg. Index
Stress Daily Stress score (Garmin)
library(ggplot2)
library(viridis)

# Create feature counts table
feature_counts <- feature_categories %>%
  group_by(Category) %>%
  summarise(Count = n()) %>%
  ungroup() %>%
  mutate(Percent = round(100 * Count / sum(Count), 1))

# Wide horizontal stacked bar plot
ggplot(feature_counts, aes(x = 1, y = Count, fill = Category)) +
  geom_bar(stat = "identity", width = 0.6, color = "white") +
  geom_text(aes(label = paste0(Count, " (", Percent, "%)")), 
            position = position_stack(vjust = 0.5), color = "black", size = 5) +
  labs(title = "Feature Counts by Category", x = NULL, y = "Number of Features") +
  coord_flip(expand = FALSE) +
  scale_fill_viridis(discrete = TRUE, option = "C") +  # modern, contrasting colors
  theme_minimal(base_size = 12) +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank(),
    plot.title = element_text(face = "bold", hjust = 0.5),
    legend.position = "right"
  ) +
  # Make plot very wide and not tall
  theme(aspect.ratio = 0.25)

5.0.2 Feature Correlations

# Select your variables
df_selected <- df_test %>%
  dplyr::select(
    years_of_education, 
    age, 
    bmi_vsorres..BMI, 
    whr_vsorres..Waist.to.Hip.Ratio..WHR.,
    pulse_vsorres..Heart.Rate..bpm.,
    sualcage, 
    sualckncf,
    sri,
    mean_pct_steps_morning,
    mean_pct_steps_afternoon,
    mean_pct_steps_evening,
    dmlhex..How.often.would.you.say.that.you.engage.i,
    dmlact..What.would.you.consider.your.typical.acti,
    dmlvex..In.a.typical.week..how.often.do.you.engag,
    hr_from_ecg,
    diet7..How.many.times.a.week.did.you.eat.desserts,
    diet6..How.many.times.a.week.did.you.eat.regular.,
    dmlpor..When.reflecting.on.your.typical.eating.ha,
    dmlfrveg..In.a.typical.day..how.many.servings.of.,
    dmlgrain..In.a.typical.day..how.often.do.you.cons,
    dmlsugar..When.reflecting.on.your.typical.eating.,
    sd_wake_clock_h,
    sd_onset_clock_h,
    cv_total_steps,
    stress_overall_daily_mean_stress
  )

feature_categories <- tidyr::tibble(
  Feature = colnames(df_selected),
  Category = c(
    "Demographics", "Demographics",  # years_of_education, age
    "Anthropometrics & Vitals", "Anthropometrics & Vitals", "Anthropometrics & Vitals", # BMI, WHR, Pulse
    "Alcohol / Substance Use", "Alcohol / Substance Use", # sualcage, sualckncf
    "Stress", # sri
    "ECG / Heart Rate & HRV", "ECG / Heart Rate & HRV", 
    "Physical Activity", "Physical Activity", "Physical Activity", "Physical Activity", "Physical Activity", "Physical Activity", "Physical Activity", # steps and activity features
    "Sleep", "Sleep", # sd_wake_clock_h, sd_onset_clock_h
    "Nutrition", "Nutrition", "Nutrition", "Nutrition", "Nutrition", "Nutrition" # diet features
  )
)

# ------------------------------
# 2. Count features per category
# ------------------------------
feature_counts <- feature_categories %>%
  group_by(Category) %>%
  summarise(Count = n(), .groups = "drop") %>%
  mutate(Percent = round(Count / sum(Count) * 100, 1))

# ------------------------------
# 3. Stacked bar plot
# ------------------------------
ggplot(feature_counts, aes(x = "", y = Count, fill = Category)) +
  geom_bar(stat = "identity", width = 0.7) +  # thicker bars
  geom_text(aes(label = paste0(Count, " (", Percent, "%)")), 
            position = position_stack(vjust = 0.5), color = "black", size = 5) +
  labs(title = "Feature Counts by Category", x = "", y = "Number of Features") +
  coord_flip(expand = FALSE) +  # remove extra padding for wider look
  scale_fill_brewer(palette = "Blues") +
  theme_classic(base_size = 10) +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank())

# Keep only numeric columns
df_numeric <- df_selected %>%
  select(where(is.numeric))

# Compute correlation matrix
cor_matrix <- cor(df_numeric, use = "pairwise.complete.obs")

# 1. Compute correlation matrix
cor_matrix <- cor(df_numeric, use = "pairwise.complete.obs")
cor_matrix[is.na(cor_matrix)] <- 0  # handle zero-variance columns

# 2. Convert to long table
cor_table <- as.data.frame(as.table(cor_matrix)) %>%
  filter(Var1 != Var2)  # remove self-correlations

# 3. Remove duplicate pairs (A,B) == (B,A)
cor_table_unique <- cor_table %>%
  rowwise() %>%
  mutate(pair = paste(sort(c(Var1, Var2)), collapse = "_")) %>%
  distinct(pair, .keep_all = TRUE) %>%
  ungroup() %>%
  dplyr::select(Var1, Var2, Freq)

# 4. Rename Freq to Correlation
cor_table_unique <- cor_table_unique %>%
  dplyr::rename(Correlation = Freq)

# 5. Sort descending by absolute correlation
cor_table_unique <- cor_table_unique %>%
  dplyr::arrange(desc(abs(Correlation)))

# 6. Print table with kable
knitr::kable(cor_table_unique, digits = 2, caption = "All Correlations (Sorted by |Correlation|)")
All Correlations (Sorted by |Correlation|)
Var1 Var2 Correlation
hr_from_ecg pulse_vsorres..Heart.Rate..bpm. 0.75
cv_total_steps mean_pct_steps_afternoon -0.63
dmlsugar..When.reflecting.on.your.typical.eating. diet7..How.many.times.a.week.did.you.eat.desserts 0.62
cv_total_steps mean_pct_steps_evening -0.59
sd_onset_clock_h sd_wake_clock_h 0.51
stress_overall_daily_mean_stress hr_from_ecg 0.51
dmlvex..In.a.typical.week..how.often.do.you.engag dmlact..What.would.you.consider.your.typical.acti 0.50
dmlact..What.would.you.consider.your.typical.acti dmlhex..How.often.would.you.say.that.you.engage.i 0.49
cv_total_steps mean_pct_steps_morning -0.49
stress_overall_daily_mean_stress mean_pct_steps_afternoon 0.48
stress_overall_daily_mean_stress pulse_vsorres..Heart.Rate..bpm. 0.46
stress_overall_daily_mean_stress cv_total_steps -0.46
mean_pct_steps_afternoon mean_pct_steps_morning 0.41
dmlfrveg..In.a.typical.day..how.many.servings.of. dmlhex..How.often.would.you.say.that.you.engage.i 0.40
diet6..How.many.times.a.week.did.you.eat.regular. diet7..How.many.times.a.week.did.you.eat.desserts 0.38
diet6..How.many.times.a.week.did.you.eat.regular. dmlvex..In.a.typical.week..how.often.do.you.engag 0.35
dmlact..What.would.you.consider.your.typical.acti bmi_vsorres..BMI -0.34
dmlvex..In.a.typical.week..how.often.do.you.engag dmlhex..How.often.would.you.say.that.you.engage.i 0.34
dmlhex..How.often.would.you.say.that.you.engage.i bmi_vsorres..BMI -0.33
dmlfrveg..In.a.typical.day..how.many.servings.of. dmlvex..In.a.typical.week..how.often.do.you.engag 0.31
sd_wake_clock_h mean_pct_steps_morning -0.30
stress_overall_daily_mean_stress mean_pct_steps_evening 0.29
stress_overall_daily_mean_stress mean_pct_steps_morning 0.29
dmlfrveg..In.a.typical.day..how.many.servings.of. dmlact..What.would.you.consider.your.typical.acti 0.29
dmlgrain..In.a.typical.day..how.often.do.you.cons years_of_education 0.28
dmlgrain..In.a.typical.day..how.often.do.you.cons dmlpor..When.reflecting.on.your.typical.eating.ha 0.27
dmlgrain..In.a.typical.day..how.often.do.you.cons diet6..How.many.times.a.week.did.you.eat.regular. 0.27
dmlsugar..When.reflecting.on.your.typical.eating. diet6..How.many.times.a.week.did.you.eat.regular. 0.27
sd_wake_clock_h mean_pct_steps_evening 0.26
dmlvex..In.a.typical.week..how.often.do.you.engag bmi_vsorres..BMI -0.26
dmlgrain..In.a.typical.day..how.often.do.you.cons dmlfrveg..In.a.typical.day..how.many.servings.of. 0.26
sd_wake_clock_h hr_from_ecg 0.26
dmlsugar..When.reflecting.on.your.typical.eating. dmlvex..In.a.typical.week..how.often.do.you.engag 0.26
mean_pct_steps_evening mean_pct_steps_afternoon 0.25
mean_pct_steps_evening age -0.25
bmi_vsorres..BMI age -0.25
dmlfrveg..In.a.typical.day..how.many.servings.of. dmlpor..When.reflecting.on.your.typical.eating.ha 0.24
stress_overall_daily_mean_stress sd_onset_clock_h 0.24
mean_pct_steps_afternoon bmi_vsorres..BMI 0.24
stress_overall_daily_mean_stress bmi_vsorres..BMI 0.24
mean_pct_steps_morning sualcage 0.24
sd_wake_clock_h years_of_education -0.23
dmlsugar..When.reflecting.on.your.typical.eating. dmlpor..When.reflecting.on.your.typical.eating.ha 0.23
sd_onset_clock_h mean_pct_steps_evening 0.23
stress_overall_daily_mean_stress dmlact..What.would.you.consider.your.typical.acti -0.23
sd_onset_clock_h pulse_vsorres..Heart.Rate..bpm. 0.23
hr_from_ecg bmi_vsorres..BMI 0.22
diet6..How.many.times.a.week.did.you.eat.regular. dmlhex..How.often.would.you.say.that.you.engage.i 0.22
pulse_vsorres..Heart.Rate..bpm. bmi_vsorres..BMI 0.22
stress_overall_daily_mean_stress dmlvex..In.a.typical.week..how.often.do.you.engag -0.22
sd_onset_clock_h hr_from_ecg 0.22
dmlfrveg..In.a.typical.day..how.many.servings.of. years_of_education 0.22
hr_from_ecg sualckncf 0.22
bmi_vsorres..BMI years_of_education -0.22
mean_pct_steps_morning age 0.21
hr_from_ecg age -0.21
dmlsugar..When.reflecting.on.your.typical.eating. dmlgrain..In.a.typical.day..how.often.do.you.cons 0.21
hr_from_ecg dmlact..What.would.you.consider.your.typical.acti -0.20
dmlgrain..In.a.typical.day..how.often.do.you.cons dmlhex..How.often.would.you.say.that.you.engage.i 0.20
sri age -0.20
sd_onset_clock_h sualcage 0.20
hr_from_ecg dmlhex..How.often.would.you.say.that.you.engage.i -0.20
hr_from_ecg dmlvex..In.a.typical.week..how.often.do.you.engag -0.19
pulse_vsorres..Heart.Rate..bpm. age -0.19
dmlhex..How.often.would.you.say.that.you.engage.i pulse_vsorres..Heart.Rate..bpm. -0.19
dmlfrveg..In.a.typical.day..how.many.servings.of. bmi_vsorres..BMI -0.19
stress_overall_daily_mean_stress sualcage 0.18
dmlact..What.would.you.consider.your.typical.acti sualckncf -0.18
dmlgrain..In.a.typical.day..how.often.do.you.cons dmlact..What.would.you.consider.your.typical.acti 0.18
cv_total_steps age 0.18
dmlvex..In.a.typical.week..how.often.do.you.engag sri 0.18
dmlgrain..In.a.typical.day..how.often.do.you.cons diet7..How.many.times.a.week.did.you.eat.desserts 0.18
stress_overall_daily_mean_stress sualckncf 0.18
dmlact..What.would.you.consider.your.typical.acti years_of_education 0.17
diet6..How.many.times.a.week.did.you.eat.regular. dmlact..What.would.you.consider.your.typical.acti 0.17
dmlsugar..When.reflecting.on.your.typical.eating. sri 0.17
stress_overall_daily_mean_stress dmlhex..How.often.would.you.say.that.you.engage.i -0.17
dmlhex..How.often.would.you.say.that.you.engage.i sualckncf -0.17
stress_overall_daily_mean_stress years_of_education -0.17
cv_total_steps sd_onset_clock_h -0.17
age years_of_education 0.17
dmlhex..How.often.would.you.say.that.you.engage.i whr_vsorres..Waist.to.Hip.Ratio..WHR. -0.16
stress_overall_daily_mean_stress dmlfrveg..In.a.typical.day..how.many.servings.of. -0.16
sri years_of_education 0.16
dmlpor..When.reflecting.on.your.typical.eating.ha dmlhex..How.often.would.you.say.that.you.engage.i 0.16
cv_total_steps sualcage -0.16
dmlfrveg..In.a.typical.day..how.many.servings.of. hr_from_ecg -0.16
dmlfrveg..In.a.typical.day..how.many.servings.of. sualckncf -0.16
diet7..How.many.times.a.week.did.you.eat.desserts sri 0.16
dmlfrveg..In.a.typical.day..how.many.servings.of. whr_vsorres..Waist.to.Hip.Ratio..WHR. -0.15
dmlvex..In.a.typical.week..how.often.do.you.engag years_of_education 0.15
diet7..How.many.times.a.week.did.you.eat.desserts dmlvex..In.a.typical.week..how.often.do.you.engag 0.15
sd_onset_clock_h dmlsugar..When.reflecting.on.your.typical.eating. -0.15
sd_onset_clock_h years_of_education -0.15
sd_onset_clock_h dmlact..What.would.you.consider.your.typical.acti -0.15
sd_onset_clock_h age -0.15
dmlfrveg..In.a.typical.day..how.many.servings.of. pulse_vsorres..Heart.Rate..bpm. -0.15
sd_onset_clock_h bmi_vsorres..BMI 0.15
dmlgrain..In.a.typical.day..how.often.do.you.cons dmlvex..In.a.typical.week..how.often.do.you.engag 0.15
diet7..How.many.times.a.week.did.you.eat.desserts mean_pct_steps_afternoon 0.15
dmlvex..In.a.typical.week..how.often.do.you.engag pulse_vsorres..Heart.Rate..bpm. -0.15
dmlvex..In.a.typical.week..how.often.do.you.engag sualcage 0.15
diet7..How.many.times.a.week.did.you.eat.desserts years_of_education 0.14
dmlsugar..When.reflecting.on.your.typical.eating. dmlact..What.would.you.consider.your.typical.acti 0.14
stress_overall_daily_mean_stress sd_wake_clock_h 0.14
dmlvex..In.a.typical.week..how.often.do.you.engag mean_pct_steps_afternoon -0.14
sd_wake_clock_h bmi_vsorres..BMI 0.14
sualcage whr_vsorres..Waist.to.Hip.Ratio..WHR. -0.14
hr_from_ecg years_of_education -0.14
mean_pct_steps_evening years_of_education -0.14
dmlsugar..When.reflecting.on.your.typical.eating. pulse_vsorres..Heart.Rate..bpm. -0.14
dmlfrveg..In.a.typical.day..how.many.servings.of. sri 0.13
sd_wake_clock_h diet7..How.many.times.a.week.did.you.eat.desserts 0.13
diet6..How.many.times.a.week.did.you.eat.regular. mean_pct_steps_afternoon 0.13
hr_from_ecg whr_vsorres..Waist.to.Hip.Ratio..WHR. 0.13
pulse_vsorres..Heart.Rate..bpm. years_of_education -0.13
dmlhex..How.often.would.you.say.that.you.engage.i years_of_education 0.13
dmlgrain..In.a.typical.day..how.often.do.you.cons pulse_vsorres..Heart.Rate..bpm. -0.13
mean_pct_steps_afternoon pulse_vsorres..Heart.Rate..bpm. 0.13
sd_onset_clock_h dmlhex..How.often.would.you.say.that.you.engage.i -0.13
dmlvex..In.a.typical.week..how.often.do.you.engag sualckncf -0.13
dmlact..What.would.you.consider.your.typical.acti pulse_vsorres..Heart.Rate..bpm. -0.13
dmlsugar..When.reflecting.on.your.typical.eating. dmlfrveg..In.a.typical.day..how.many.servings.of. 0.13
dmlfrveg..In.a.typical.day..how.many.servings.of. sualcage 0.13
sd_wake_clock_h pulse_vsorres..Heart.Rate..bpm. 0.13
dmlpor..When.reflecting.on.your.typical.eating.ha whr_vsorres..Waist.to.Hip.Ratio..WHR. -0.12
dmlact..What.would.you.consider.your.typical.acti whr_vsorres..Waist.to.Hip.Ratio..WHR. -0.12
dmlsugar..When.reflecting.on.your.typical.eating. years_of_education 0.12
dmlpor..When.reflecting.on.your.typical.eating.ha dmlact..What.would.you.consider.your.typical.acti 0.12
dmlhex..How.often.would.you.say.that.you.engage.i age 0.12
dmlact..What.would.you.consider.your.typical.acti age 0.12
sd_onset_clock_h dmlgrain..In.a.typical.day..how.often.do.you.cons -0.11
dmlfrveg..In.a.typical.day..how.many.servings.of. diet6..How.many.times.a.week.did.you.eat.regular. 0.11
dmlsugar..When.reflecting.on.your.typical.eating. hr_from_ecg -0.11
stress_overall_daily_mean_stress sri -0.11
diet7..How.many.times.a.week.did.you.eat.desserts dmlhex..How.often.would.you.say.that.you.engage.i 0.11
stress_overall_daily_mean_stress dmlsugar..When.reflecting.on.your.typical.eating. -0.11
sd_wake_clock_h age -0.11
mean_pct_steps_morning years_of_education 0.11
dmlgrain..In.a.typical.day..how.often.do.you.cons bmi_vsorres..BMI -0.11
whr_vsorres..Waist.to.Hip.Ratio..WHR. age 0.11
dmlgrain..In.a.typical.day..how.often.do.you.cons mean_pct_steps_morning 0.11
hr_from_ecg mean_pct_steps_morning -0.10
dmlpor..When.reflecting.on.your.typical.eating.ha mean_pct_steps_morning -0.10
sd_onset_clock_h sualckncf -0.10
cv_total_steps dmlsugar..When.reflecting.on.your.typical.eating. 0.10
diet7..How.many.times.a.week.did.you.eat.desserts hr_from_ecg -0.10
dmlfrveg..In.a.typical.day..how.many.servings.of. mean_pct_steps_afternoon -0.10
dmlfrveg..In.a.typical.day..how.many.servings.of. diet7..How.many.times.a.week.did.you.eat.desserts 0.10
cv_total_steps dmlact..What.would.you.consider.your.typical.acti 0.10
cv_total_steps bmi_vsorres..BMI -0.10
mean_pct_steps_morning bmi_vsorres..BMI 0.10
hr_from_ecg sri -0.10
hr_from_ecg mean_pct_steps_evening 0.10
dmlpor..When.reflecting.on.your.typical.eating.ha pulse_vsorres..Heart.Rate..bpm. -0.10
stress_overall_daily_mean_stress age -0.10
hr_from_ecg mean_pct_steps_afternoon 0.10
sd_wake_clock_h dmlsugar..When.reflecting.on.your.typical.eating. -0.10
mean_pct_steps_afternoon sri -0.10
dmlpor..When.reflecting.on.your.typical.eating.ha dmlvex..In.a.typical.week..how.often.do.you.engag 0.10
dmlact..What.would.you.consider.your.typical.acti sri 0.09
cv_total_steps whr_vsorres..Waist.to.Hip.Ratio..WHR. 0.09
cv_total_steps diet7..How.many.times.a.week.did.you.eat.desserts -0.09
stress_overall_daily_mean_stress diet7..How.many.times.a.week.did.you.eat.desserts 0.09
sri sualcage 0.09
sd_wake_clock_h dmlvex..In.a.typical.week..how.often.do.you.engag -0.09
sd_wake_clock_h dmlact..What.would.you.consider.your.typical.acti -0.09
cv_total_steps sri -0.09
sd_wake_clock_h diet6..How.many.times.a.week.did.you.eat.regular. 0.09
dmlvex..In.a.typical.week..how.often.do.you.engag age 0.09
dmlsugar..When.reflecting.on.your.typical.eating. mean_pct_steps_evening -0.09
sualckncf whr_vsorres..Waist.to.Hip.Ratio..WHR. 0.09
sualckncf bmi_vsorres..BMI 0.09
sd_wake_clock_h mean_pct_steps_afternoon 0.09
diet6..How.many.times.a.week.did.you.eat.regular. years_of_education 0.08
diet7..How.many.times.a.week.did.you.eat.desserts pulse_vsorres..Heart.Rate..bpm. -0.08
mean_pct_steps_evening pulse_vsorres..Heart.Rate..bpm. 0.08
diet6..How.many.times.a.week.did.you.eat.regular. hr_from_ecg -0.08
dmlpor..When.reflecting.on.your.typical.eating.ha diet7..How.many.times.a.week.did.you.eat.desserts 0.08
dmlact..What.would.you.consider.your.typical.acti mean_pct_steps_afternoon -0.08
mean_pct_steps_afternoon years_of_education -0.08
diet6..How.many.times.a.week.did.you.eat.regular. sualcage -0.08
cv_total_steps dmlhex..How.often.would.you.say.that.you.engage.i -0.08
whr_vsorres..Waist.to.Hip.Ratio..WHR. years_of_education -0.08
mean_pct_steps_evening sualcage 0.08
sd_onset_clock_h dmlfrveg..In.a.typical.day..how.many.servings.of. 0.08
diet6..How.many.times.a.week.did.you.eat.regular. mean_pct_steps_evening 0.08
mean_pct_steps_evening whr_vsorres..Waist.to.Hip.Ratio..WHR. -0.07
dmlhex..How.often.would.you.say.that.you.engage.i sri 0.07
stress_overall_daily_mean_stress dmlgrain..In.a.typical.day..how.often.do.you.cons -0.07
cv_total_steps pulse_vsorres..Heart.Rate..bpm. -0.07
dmlhex..How.often.would.you.say.that.you.engage.i mean_pct_steps_evening 0.07
sri whr_vsorres..Waist.to.Hip.Ratio..WHR. -0.07
dmlpor..When.reflecting.on.your.typical.eating.ha sualcage 0.07
diet6..How.many.times.a.week.did.you.eat.regular. sri -0.07
dmlsugar..When.reflecting.on.your.typical.eating. dmlhex..How.often.would.you.say.that.you.engage.i 0.07
dmlsugar..When.reflecting.on.your.typical.eating. bmi_vsorres..BMI -0.07
sualcage pulse_vsorres..Heart.Rate..bpm. -0.07
mean_pct_steps_evening sualckncf -0.07
cv_total_steps dmlgrain..In.a.typical.day..how.often.do.you.cons 0.07
sd_onset_clock_h mean_pct_steps_morning -0.07
dmlpor..When.reflecting.on.your.typical.eating.ha mean_pct_steps_evening -0.06
dmlsugar..When.reflecting.on.your.typical.eating. mean_pct_steps_morning -0.06
dmlpor..When.reflecting.on.your.typical.eating.ha diet6..How.many.times.a.week.did.you.eat.regular. 0.06
diet6..How.many.times.a.week.did.you.eat.regular. mean_pct_steps_morning 0.06
dmlact..What.would.you.consider.your.typical.acti sualcage 0.06
cv_total_steps dmlpor..When.reflecting.on.your.typical.eating.ha 0.06
dmlpor..When.reflecting.on.your.typical.eating.ha age 0.06
dmlvex..In.a.typical.week..how.often.do.you.engag whr_vsorres..Waist.to.Hip.Ratio..WHR. -0.06
dmlfrveg..In.a.typical.day..how.many.servings.of. mean_pct_steps_morning -0.06
sualckncf pulse_vsorres..Heart.Rate..bpm. 0.06
sri bmi_vsorres..BMI -0.06
dmlgrain..In.a.typical.day..how.often.do.you.cons sualckncf 0.06
sualcage bmi_vsorres..BMI 0.06
dmlgrain..In.a.typical.day..how.often.do.you.cons age 0.06
sd_onset_clock_h dmlpor..When.reflecting.on.your.typical.eating.ha 0.06
dmlgrain..In.a.typical.day..how.often.do.you.cons mean_pct_steps_evening -0.06
sri pulse_vsorres..Heart.Rate..bpm. -0.06
pulse_vsorres..Heart.Rate..bpm. whr_vsorres..Waist.to.Hip.Ratio..WHR. 0.06
mean_pct_steps_morning sri -0.06
sd_wake_clock_h sualckncf -0.06
sd_wake_clock_h sri -0.06
dmlsugar..When.reflecting.on.your.typical.eating. mean_pct_steps_afternoon -0.06
sd_wake_clock_h dmlgrain..In.a.typical.day..how.often.do.you.cons -0.05
sd_onset_clock_h dmlvex..In.a.typical.week..how.often.do.you.engag -0.05
mean_pct_steps_afternoon age -0.05
dmlgrain..In.a.typical.day..how.often.do.you.cons sualcage 0.05
sualcage years_of_education 0.05
dmlvex..In.a.typical.week..how.often.do.you.engag mean_pct_steps_evening 0.05
cv_total_steps dmlvex..In.a.typical.week..how.often.do.you.engag 0.05
stress_overall_daily_mean_stress whr_vsorres..Waist.to.Hip.Ratio..WHR. 0.05
dmlpor..When.reflecting.on.your.typical.eating.ha sualckncf 0.05
dmlgrain..In.a.typical.day..how.often.do.you.cons sri -0.04
cv_total_steps diet6..How.many.times.a.week.did.you.eat.regular. -0.04
sd_wake_clock_h sualcage -0.04
diet7..How.many.times.a.week.did.you.eat.desserts bmi_vsorres..BMI 0.04
dmlpor..When.reflecting.on.your.typical.eating.ha sri 0.04
diet7..How.many.times.a.week.did.you.eat.desserts age 0.04
cv_total_steps sualckncf 0.04
diet6..How.many.times.a.week.did.you.eat.regular. pulse_vsorres..Heart.Rate..bpm. -0.04
dmlpor..When.reflecting.on.your.typical.eating.ha bmi_vsorres..BMI -0.04
dmlfrveg..In.a.typical.day..how.many.servings.of. mean_pct_steps_evening 0.04
sd_onset_clock_h mean_pct_steps_afternoon 0.04
dmlact..What.would.you.consider.your.typical.acti mean_pct_steps_evening -0.04
sualckncf years_of_education -0.04
diet7..How.many.times.a.week.did.you.eat.desserts sualckncf -0.04
diet6..How.many.times.a.week.did.you.eat.regular. age -0.04
mean_pct_steps_morning pulse_vsorres..Heart.Rate..bpm. -0.04
sd_onset_clock_h diet6..How.many.times.a.week.did.you.eat.regular. -0.04
diet7..How.many.times.a.week.did.you.eat.desserts sualcage 0.04
whr_vsorres..Waist.to.Hip.Ratio..WHR. bmi_vsorres..BMI 0.03
mean_pct_steps_afternoon whr_vsorres..Waist.to.Hip.Ratio..WHR. -0.03
mean_pct_steps_evening mean_pct_steps_morning -0.03
stress_overall_daily_mean_stress diet6..How.many.times.a.week.did.you.eat.regular. 0.03
sd_wake_clock_h dmlfrveg..In.a.typical.day..how.many.servings.of. 0.03
diet6..How.many.times.a.week.did.you.eat.regular. sualckncf -0.03
stress_overall_daily_mean_stress dmlpor..When.reflecting.on.your.typical.eating.ha -0.03
diet7..How.many.times.a.week.did.you.eat.desserts mean_pct_steps_morning 0.03
dmlact..What.would.you.consider.your.typical.acti mean_pct_steps_morning 0.03
cv_total_steps hr_from_ecg -0.03
dmlgrain..In.a.typical.day..how.often.do.you.cons mean_pct_steps_afternoon -0.03
sd_wake_clock_h dmlhex..How.often.would.you.say.that.you.engage.i 0.03
dmlfrveg..In.a.typical.day..how.many.servings.of. age -0.02
diet7..How.many.times.a.week.did.you.eat.desserts whr_vsorres..Waist.to.Hip.Ratio..WHR. 0.02
sd_onset_clock_h whr_vsorres..Waist.to.Hip.Ratio..WHR. -0.02
diet6..How.many.times.a.week.did.you.eat.regular. whr_vsorres..Waist.to.Hip.Ratio..WHR. -0.02
sd_wake_clock_h whr_vsorres..Waist.to.Hip.Ratio..WHR. -0.02
sri sualckncf 0.02
diet7..How.many.times.a.week.did.you.eat.desserts dmlact..What.would.you.consider.your.typical.acti 0.02
sd_wake_clock_h dmlpor..When.reflecting.on.your.typical.eating.ha -0.02
cv_total_steps sd_wake_clock_h 0.02
mean_pct_steps_morning whr_vsorres..Waist.to.Hip.Ratio..WHR. -0.02
dmlsugar..When.reflecting.on.your.typical.eating. age -0.02
dmlpor..When.reflecting.on.your.typical.eating.ha hr_from_ecg -0.02
dmlgrain..In.a.typical.day..how.often.do.you.cons hr_from_ecg -0.02
dmlvex..In.a.typical.week..how.often.do.you.engag mean_pct_steps_morning -0.02
dmlsugar..When.reflecting.on.your.typical.eating. sualcage -0.02
diet6..How.many.times.a.week.did.you.eat.regular. bmi_vsorres..BMI -0.02
diet7..How.many.times.a.week.did.you.eat.desserts mean_pct_steps_evening 0.01
dmlsugar..When.reflecting.on.your.typical.eating. sualckncf -0.01
dmlpor..When.reflecting.on.your.typical.eating.ha mean_pct_steps_afternoon 0.01
dmlhex..How.often.would.you.say.that.you.engage.i mean_pct_steps_morning 0.01
dmlpor..When.reflecting.on.your.typical.eating.ha years_of_education -0.01
dmlgrain..In.a.typical.day..how.often.do.you.cons whr_vsorres..Waist.to.Hip.Ratio..WHR. 0.01
sd_onset_clock_h diet7..How.many.times.a.week.did.you.eat.desserts 0.01
mean_pct_steps_evening sri 0.01
cv_total_steps dmlfrveg..In.a.typical.day..how.many.servings.of. 0.01
sualcage age -0.01
cv_total_steps years_of_education 0.01
dmlhex..How.often.would.you.say.that.you.engage.i mean_pct_steps_afternoon 0.01
mean_pct_steps_morning sualckncf -0.01
sd_onset_clock_h sri 0.00
mean_pct_steps_evening bmi_vsorres..BMI 0.00
mean_pct_steps_afternoon sualckncf 0.00
hr_from_ecg sualcage 0.00
dmlhex..How.often.would.you.say.that.you.engage.i sualcage 0.00
sualckncf age 0.00
mean_pct_steps_afternoon sualcage 0.00
dmlsugar..When.reflecting.on.your.typical.eating. whr_vsorres..Waist.to.Hip.Ratio..WHR. 0.00
sualckncf sualcage 0.00
# 7. Summary description
# Example summary_stats dataframe
summary_stats <- cor_table_unique %>%
  dplyr::summarize(
    Mean_Correlation = mean(Correlation, na.rm = TRUE),
    Median_Correlation = median(Correlation, na.rm = TRUE),
    Max_Correlation = max(Correlation, na.rm = TRUE),
    Min_Correlation = min(Correlation, na.rm = TRUE)
  )

# Display as horizontal table
kable(summary_stats, "html", caption = "Summary of Correlations") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = F, font_size = 14) %>%
  row_spec(0, bold = TRUE, background = "#A5D8FF")
Summary of Correlations
Mean_Correlation Median_Correlation Max_Correlation Min_Correlation
0.0260519 0.0115898 0.7517405 -0.6285951
#hr_from_ecg pulse_vsorres..Heart.Rate..bpm.       0.752           0.752

5.0.3 Random Forest

X <- df_test %>%
  dplyr::select(
     fh_dm2sb_cat,
     years_of_education, 
     age, 
     bmi_vsorres..BMI, 
     whr_vsorres..Waist.to.Hip.Ratio..WHR.,
     pulse_vsorres..Heart.Rate..bpm.,
     sualcage, 
     sualckncf,
     #hrv_rmssd,
     sri,
     mean_pct_steps_morning,
     mean_pct_steps_evening,
     dmlhex..How.often.would.you.say.that.you.engage.i,
     dmlvex..In.a.typical.week..how.often.do.you.engag,
     hr_from_ecg,
     mean_active_minutes,
     diet6..How.many.times.a.week.did.you.eat.regular.,
     diet1..How.many.times.a.week.did.you.eat.fast.foo,
     dmlgrain..In.a.typical.day..how.often.do.you.cons,
     dmlsugar..When.reflecting.on.your.typical.eating.,
    cv_total_steps,
    stress_overall_daily_mean_stress) %>%
  mutate(across(where(is.factor), ~ factor(trimws(.))))

# ------------------------------
# 2. Define target variable
# ------------------------------
# 1. Clean target
X <- X %>% mutate(across(where(is.numeric), as.double))

y <- factor(trimws(df_test$IR_label))
levels(y) <- make.names(levels(y))  # ensures valid R names

# 2. Combine with predictors
df_model <- X
df_model$y <- y
#df_model <- na.omit(df_model)

X_clean <- df_model %>% select(-y)
y_clean <- df_model$y

# 3. Train/test split
set.seed(123)
train_index <- createDataPartition(y_clean, p = 0.8, list = FALSE)
X_train <- X_clean[train_index, ]
X_test  <- X_clean[-train_index, ]
y_train <- y_clean[train_index]
y_test  <- y_clean[-train_index]

# 4. Train RF
set.seed(123)
train_control <- trainControl(
  method = "cv",
  number = 10,
  sampling = "up" ,
  savePredictions = "final",
  classProbs = TRUE
)

set.seed(123)
model_rf <- train(
  x = X_train,
  y = y_train,
  method = "rf",
  trControl = train_control,
  preProcess = c("medianImpute"),
  tuneLength = 5
)

# 5. Evaluate
pred_class <- predict(model_rf, X_test)
cm <- caret::confusionMatrix(pred_class, y_test)

accuracy <- as.numeric(cm$overall["Accuracy"])
sensitivity <- as.numeric(cm$byClass["Sensitivity"])
specificity <- as.numeric(cm$byClass["Specificity"])
precision <- as.numeric(cm$byClass["Pos Pred Value"])
f1_score <- 2 * (precision * sensitivity) / (precision + sensitivity)

# Compute Balanced Accuracy
balanced_accuracy <- (sensitivity + specificity) / 2

# Create metrics dataframe including Balanced Accuracy
metrics <- data.frame(
  Accuracy = accuracy,
  Sensitivity = sensitivity,
  Specificity = specificity,
  Balanced_Accuracy = balanced_accuracy,
  Precision = precision,
  F1_Score = f1_score
)

# Example metrics dataframe
metrics <- data.frame(
  Metric = c("Accuracy", "Sensitivity", "Specificity", "Balanced Accuracy", "Precision", "F1 Score"),
  Value = c(accuracy, sensitivity, specificity, balanced_accuracy, precision, f1_score)
)

# Create a nice HTML table
kable(metrics, "html", caption = "Model Performance Metrics") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = T, font_size = 14) %>%
  row_spec(0, bold = TRUE, background = "#A5D8FF") %>%
  column_spec(2, bold = TRUE)
Model Performance Metrics
Metric Value
Accuracy 0.7407407
Sensitivity 0.8333333
Specificity 0.6666667
Balanced Accuracy 0.7500000
Precision 0.6666667
F1 Score 0.7407407
# 2. ROC Curve & AUC
# ------------------------------
pred_probs <- predict(model_rf, X_test, type = "prob")[, "IR"]
roc_obj <- roc(y_test, pred_probs, levels = rev(levels(y_test)))

# ROC plot using ggplot2
roc_df <- data.frame(
  FPR = 1 - roc_obj$specificities,
  TPR = roc_obj$sensitivities
)

roc_plot = ggplot(roc_df, aes(x = FPR, y = TPR)) +
  geom_line(color = "#2C7BB6", size = 1.2) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "grey50") +
  annotate("text", x = 0.6, y = 0.1, label = paste0("AUC = ", round(auc(roc_obj), 3)), size = 5) +
  labs(title = "ROC Curve", x = "False Positive Rate (1 - Specificity)", y = "True Positive Rate (Sensitivity)") +
  theme_classic(base_size = 14) +
  theme(aspect.ratio=0.9)


# ------------------------------
# 3. Precision-Recall Curve & PRAUC
# ------------------------------
# Convert y_test to binary numeric: 1 = IR, 0 = Non-IR
y_bin <- ifelse(y_test == "IR", 1, 0)

pr_obj <- pr.curve(scores.class0 = pred_probs, weights.class0 = y_bin, curve = TRUE)

pr_df <- data.frame(
  Recall = pr_obj$curve[,1],
  Precision = pr_obj$curve[,2]
)

pr_plot = ggplot(pr_df, aes(x = Recall, y = Precision)) +
  geom_line(color = "#D7191C", size = 1.2) +
  annotate("text", x = 0.6, y = 0.1, label = paste0("PRAUC = ", round(pr_obj$auc.integral, 3)), size = 5) +
  labs(title = "Precision-Recall Curve", x = "Recall (Sensitivity)", y = "Precision") +
  theme_classic(base_size = 14) +
  theme(aspect.ratio=0.9)


combined_plot <- roc_plot + pr_plot + plot_layout(ncol = 2)
combined_plot 

# shap <- Shapley$new(predictor, x.interest = X_train[1, ]) # example: first row
# # If you want all rows, you can loop or sample (expensive for large datasets)
# 
# # Extract SHAP values for the first observation
# shap_values <- shap$results
# print(shap_values)

# ------------------------------
# 4. Summary plot (global feature importance via SHAP)
# ------------------------------
# shapley_summary <- FeatureImp$new(predictor, loss = "ce")  # cross-entropy loss for classification
# plot(shapley_summary)  # shows global feature importance
n_runs <- 50
all_var_imp <- matrix(NA, nrow = ncol(X_train), ncol = n_runs)
rownames(all_var_imp) <- colnames(X_train)

set.seed(123)
for (i in 1:n_runs) {
  train_idx <- createDataPartition(y_clean, p = 0.7, list = FALSE)
  X_tr <- X_clean[train_idx, ]
  X_te <- X_clean[-train_idx, ]
  y_tr <- y_clean[train_idx]
  y_te <- y_clean[-train_idx]
  
  rf <- train(
    x = X_tr,
    y = y_tr,
    method = "rf",
    trControl = trainControl(method = "cv", number = 5),
    preProcess = c("medianImpute"),
    tuneLength = 5
  )
  
  imp <- varImp(rf, scale = TRUE)$importance
  all_var_imp[, i] <- imp$Overall
}

var_imp_mean <- rowMeans(all_var_imp)
var_imp_sd <- apply(all_var_imp, 1, sd)

var_imp_stable <- data.frame(
  Feature = rownames(all_var_imp),
  MeanImportance = var_imp_mean,
  SDImportance = var_imp_sd
) %>% arrange(desc(MeanImportance))

print(head(var_imp_stable, 20))
##                                                                                             Feature
## bmi_vsorres..BMI                                                                   bmi_vsorres..BMI
## whr_vsorres..Waist.to.Hip.Ratio..WHR.                         whr_vsorres..Waist.to.Hip.Ratio..WHR.
## stress_overall_daily_mean_stress                                   stress_overall_daily_mean_stress
## hr_from_ecg                                                                             hr_from_ecg
## sri                                                                                             sri
## mean_active_minutes                                                             mean_active_minutes
## pulse_vsorres..Heart.Rate..bpm.                                     pulse_vsorres..Heart.Rate..bpm.
## mean_pct_steps_morning                                                       mean_pct_steps_morning
## mean_pct_steps_evening                                                       mean_pct_steps_evening
## age                                                                                             age
## cv_total_steps                                                                       cv_total_steps
## sualcage                                                                                   sualcage
## years_of_education                                                               years_of_education
## dmlhex..How.often.would.you.say.that.you.engage.i dmlhex..How.often.would.you.say.that.you.engage.i
## diet6..How.many.times.a.week.did.you.eat.regular. diet6..How.many.times.a.week.did.you.eat.regular.
## dmlvex..In.a.typical.week..how.often.do.you.engag dmlvex..In.a.typical.week..how.often.do.you.engag
## fh_dm2sb_cat                                                                           fh_dm2sb_cat
## dmlsugar..When.reflecting.on.your.typical.eating. dmlsugar..When.reflecting.on.your.typical.eating.
## dmlgrain..In.a.typical.day..how.often.do.you.cons dmlgrain..In.a.typical.day..how.often.do.you.cons
## diet1..How.many.times.a.week.did.you.eat.fast.foo diet1..How.many.times.a.week.did.you.eat.fast.foo
##                                                   MeanImportance SDImportance
## bmi_vsorres..BMI                                       97.794220     7.545864
## whr_vsorres..Waist.to.Hip.Ratio..WHR.                  72.945954    17.990147
## stress_overall_daily_mean_stress                       41.423222    19.519506
## hr_from_ecg                                            28.679576    15.919824
## sri                                                    26.796903    16.669371
## mean_active_minutes                                    26.728777    13.692341
## pulse_vsorres..Heart.Rate..bpm.                        26.363365    14.957959
## mean_pct_steps_morning                                 22.181617    11.449555
## mean_pct_steps_evening                                 21.277253    11.878361
## age                                                    20.500170    11.427062
## cv_total_steps                                         20.180248    11.926483
## sualcage                                               18.225723     9.409087
## years_of_education                                     17.030107    12.084327
## dmlhex..How.often.would.you.say.that.you.engage.i      15.056297    13.616977
## diet6..How.many.times.a.week.did.you.eat.regular.      14.953623     8.906635
## dmlvex..In.a.typical.week..how.often.do.you.engag       8.522026     7.229816
## fh_dm2sb_cat                                            6.963167     7.054268
## dmlsugar..When.reflecting.on.your.typical.eating.       4.843838     3.347394
## dmlgrain..In.a.typical.day..how.often.do.you.cons       4.475492     4.660171
## diet1..How.many.times.a.week.did.you.eat.fast.foo       3.068553     3.475977
# ------------------------------
# 9. SHAP values (conditional contribution)

library(randomForest)
library(iml)

# ------------------------------
predictor <- Predictor$new(
  model = model_rf,
  data = X_train,
  y = y_train,
  type = "prob"
)

# Example: compute SHAP for first 100 rows
sample_idx <- sample(1:nrow(X_train), min(100, nrow(X_train)))
shap_values_list <- lapply(sample_idx, function(i) {
  shap <- Shapley$new(predictor, x.interest = X_train[i, ])
  shap$results
})

shap_all <- do.call(rbind, shap_values_list)
shap_global <- shap_all %>%
  group_by(feature) %>%
  summarise(mean_abs_shap = mean(abs(phi))) %>%
  arrange(desc(mean_abs_shap))

# Combine SHAP results
shap_all <- do.call(rbind, shap_values_list)

# Summarize global SHAP values
shap_global <- shap_all %>%
  group_by(feature) %>%
  summarise(mean_abs_shap = mean(abs(phi)), .groups = "drop") %>%
  arrange(desc(mean_abs_shap))

# Create a nice table with kable
shap_global %>%
  mutate(rank = row_number()) %>%  # optional: add a rank column
  select(rank, everything()) %>%
  kable(
    caption = "Global SHAP Feature Importance",
    col.names = c("Rank", "Feature", "Mean |SHAP|"),
    digits = 3,
    align = c("c", "l", "c")
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE,
    position = "center"
  )
Global SHAP Feature Importance
Rank Feature Mean &#124;SHAP&#124;
1 bmi_vsorres..BMI 0.128
2 whr_vsorres..Waist.to.Hip.Ratio..WHR. 0.077
3 stress_overall_daily_mean_stress 0.028
4 diet6..How.many.times.a.week.did.you.eat.regular. 0.026
5 hr_from_ecg 0.023
6 dmlhex..How.often.would.you.say.that.you.engage.i 0.023
7 pulse_vsorres..Heart.Rate..bpm. 0.018
8 sri 0.015
9 mean_pct_steps_evening 0.014
10 age 0.014
11 years_of_education 0.014
12 cv_total_steps 0.014
13 mean_active_minutes 0.012
14 sualcage 0.011
15 mean_pct_steps_morning 0.011
16 fh_dm2sb_cat 0.010
17 dmlsugar..When.reflecting.on.your.typical.eating. 0.010
18 dmlvex..In.a.typical.week..how.often.do.you.engag 0.008
19 dmlgrain..In.a.typical.day..how.often.do.you.cons 0.005
20 sualckncf 0.005
21 diet1..How.many.times.a.week.did.you.eat.fast.foo 0.004
library(dplyr)
library(ggplot2)
library(tidyr)

# ------------------------------
# 1. Combine SHAP with categories
# ------------------------------
shap_categorized <- shap_global %>%
  left_join(feature_categories, by = c("feature" = "Feature"))

# ------------------------------
# 2. Compute total and percent contribution by category
# ------------------------------
category_shap <- shap_categorized %>%
  group_by(Category) %>%
  summarise(
    total_shap = sum(mean_abs_shap),
    .groups = "drop"
  ) %>%
  mutate(percent_contribution = total_shap / sum(total_shap) * 100)

# ------------------------------
# 3. Plot
# ------------------------------
ggplot(category_shap, aes(x = reorder(Category, -percent_contribution), y = percent_contribution, fill = Category)) +
  geom_bar(stat = "identity", color = "black") +
  geom_text(aes(label = paste0(round(percent_contribution, 1), "%")), vjust = -0.5, size = 4) +
  labs(
    title = "SHAP Feature Contributions by Category",
    x = "Feature Category",
    y = "Percent of Total Mean |SHAP|"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none") +
  coord_cartesian(ylim = c(0, max(category_shap$percent_contribution) * 1.1))

6 External validation

6.1 Confusion Matrix

df_external  <- read.csv("~/external_test.csv", header = TRUE, stringsAsFactors = FALSE)

unhealthy_vars <- c(
  "diet1..How.many.times.a.week.did.you.eat.fast.foo",  
  "diet4..How.many.regular.sodas.or.glasses.of.sweet",  
  "diet6..How.many.times.a.week.did.you.eat.regular.",  
  "diet7..How.many.times.a.week.did.you.eat.desserts",  
  "diet9..How.many.servings.of.fruit.juice.did.you.d"
)

df_external[unhealthy_vars] <- 2 - df_external[unhealthy_vars]
df_external$diet_unhealthy_score <- rowSums(df_external[unhealthy_vars], na.rm = TRUE)


predictor_names <- c(
  "fh_dm2sb_cat",
  "years_of_education",
  "age",
  "bmi_vsorres..BMI",
  "whr_vsorres..Waist.to.Hip.Ratio..WHR.",
  "pulse_vsorres..Heart.Rate..bpm.",
  "sualcage",
  "sualckncf",
  "diet_unhealthy_score",
  "sri",
  "mean_pct_steps_morning",
  "mean_pct_steps_afternoon",
  "mean_pct_steps_evening",
  "diet4..How.many.regular.sodas.or.glasses.of.sweet",
  "dmlhex..How.often.would.you.say.that.you.engage.i",
  "dmlact..What.would.you.consider.your.typical.acti",
  "dmlvex..In.a.typical.week..how.often.do.you.engag",
  "hr_from_ecg",
  "mean_active_minutes",
  "diet7..How.many.times.a.week.did.you.eat.desserts",
  "diet6..How.many.times.a.week.did.you.eat.regular.",
  "diet1..How.many.times.a.week.did.you.eat.fast.foo",
  "diet9..How.many.servings.of.fruit.juice.did.you.d",
  "dmlpor..When.reflecting.on.your.typical.eating.ha",
  "dmlfrveg..In.a.typical.day..how.many.servings.of.",
  "dmlgrain..In.a.typical.day..how.often.do.you.cons",
  "dmlsugar..When.reflecting.on.your.typical.eating.",
  "sd_wake_clock_h",
  "sd_onset_clock_h",
  "cv_total_steps",
  "stress_overall_daily_mean_stress")

df_external$fh_dm2sb_cat <- trimws(tolower(df_external$fh_dm2sb_cat))
df_external$fh_dm2sb_cat <- factor(df_external$fh_dm2sb_cat, levels=c("no","yes"))

mode_val <- names(sort(table(X_train$fh_dm2sb_cat), decreasing = TRUE))[1]

# Replace NA in external dataset
df_external$fh_dm2sb_cat[is.na(df_external$fh_dm2sb_cat)] <- mode_val

# Ensure factor levels match training set
df_external$fh_dm2sb_cat <- factor(df_external$fh_dm2sb_cat, levels = levels(X_train$fh_dm2sb_cat))

X_ext <- df_external %>% dplyr::select(all_of(predictor_names))

pre_proc <- preProcess(X_train, method = "medianImpute")
X_ext_clean <- predict(pre_proc, X_ext)

y_ext <-  df_external$IR_label

# Replace "Non-IR" → "Non.IR"
df_external$IR_label <- gsub("^Non-IR$", "Non.IR", df_external$IR_label)

# Fix any spacing issues
df_external$IR_label <- trimws(df_external$IR_label)

# Convert to factor with correct levels
y_ext <- factor(df_external$IR_label)

pred_class_ext <- predict(model_rf, X_ext_clean)
cm_ext <- caret::confusionMatrix(pred_class_ext, y_ext)

library(ggplot2)
library(dplyr)
library(scales)

# ------------------------------
# 1. Convert confusionMatrix to tidy dataframe

total_n <- length(y_train) + length(y_test)

split_counts <- data.frame(
  Set = factor(c("Train", "Test"), levels = c("Train", "Test")),  # force order
  N = c(length(y_train), length(y_test))
) %>%
  mutate(Percent = N / total_n * 100)  # percentage

# ------------------------------
# 2. Plot
# ------------------------------
split_plot <- ggplot(split_counts, aes(x = Set, y = N, fill = Set)) +
  geom_bar(stat = "identity", width = 0.5) +
  geom_text(aes(label = paste0(N, "\n(", round(Percent,1), "%)")), 
            vjust = -0.5, size = 5) +
  scale_fill_brewer(palette = "Set3") +
  labs(title = "Train vs Test Sample Sizes", y = "Number of Samples", x = "", face = "bold") +
  theme_minimal(base_size = 8) +
  theme(legend.position = "none") +
  theme(aspect.ratio=0.9)

# ------------------------------
cm_ext_df <- as.data.frame(cm_ext$table) %>%
  dplyr::rename(
    True = Reference,
    Predicted = Prediction,
    Count = Freq
  ) %>%
  group_by(True) %>%
  mutate(
    Percent = Count / sum(Count)  # proportion per true class
  ) %>%
  ungroup()

# ------------------------------
# 2. Plot confusion matrix heatmap
# ------------------------------
cm_ext_plot <- ggplot(cm_ext_df, aes(x = Predicted, y = True, fill = Percent)) +
  geom_tile(color = "white", size = 0.5) +
  geom_text(aes(label = paste0(Count, "\n(", round(Percent*100,1), "%)")), size = 6) +
  scale_fill_gradient(low = "#DCE6F1", high = "#2C7BB6", labels = percent_format()) +
  labs(
    title = "External Test Set Confusion Matrix",
    x = "Predicted Label",
    y = "True Label",
    fill = "Proportion"
  ) +
  theme_minimal(base_size = 8) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, face = "bold"),
    axis.text.y = element_text(face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.title = element_text(face = "bold"))

# ------------------------------
# 3. Display the plot
# ------------------------------
combined_plot <- split_plot + cm_ext_plot + plot_layout(ncol = 2, widths = c(2, 4))
combined_plot

# ------------------------------
# 3. Display
# ------------------------------

6.2 RF Performance Metrics

 metrics_ext <- data.frame(
  Accuracy = cm_ext$overall["Accuracy"],
  Sensitivity = cm_ext$byClass["Sensitivity"],
  Specificity = cm_ext$byClass["Specificity"],
  Balanced_Accuracy = (cm_ext$byClass["Sensitivity"] + cm_ext$byClass["Specificity"]) / 2,  # <-- new
  Precision = cm_ext$byClass["Pos Pred Value"],
  F1_Score = 2 * (cm_ext$byClass["Pos Pred Value"] * cm_ext$byClass["Sensitivity"]) /
    (cm_ext$byClass["Pos Pred Value"] + cm_ext$byClass["Sensitivity"])
)

library(knitr)
library(kableExtra)
library(dplyr)

# ------------------------------
# 1. Clean numeric metrics for display
# ------------------------------
metrics_ext_clean <- metrics_ext %>%
  # Convert all columns to numeric
  mutate(across(everything(), ~ as.numeric(.))) %>%
  # Round to 3 decimal places for readability
  mutate(across(everything(), ~ round(., 3))) %>%
  # Convert to long format for nice table
  tidyr::pivot_longer(
    cols = everything(),
    names_to = "Metric",
    values_to = "Value"
  )

# ------------------------------
# 2. Create HTML kable
# ------------------------------
kable(metrics_ext_clean, "html", caption = "Extended Model Performance Metrics") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = T,
    font_size = 14
  ) %>%
  row_spec(0, bold = TRUE, background = "#A5D8FF") %>%
  column_spec(2, bold = TRUE)
Extended Model Performance Metrics
Metric Value
Accuracy 0.696
Sensitivity 0.577
Specificity 0.800
Balanced_Accuracy 0.688
Precision 0.714
F1_Score 0.638
 library(dplyr)
library(ggplot2)
library(gridExtra)

# ------------------------------
# 1. Train/Test counts (unchanged)
# ------------------------------

6.3 Feature Importance