distance_clustering-consistency.R

library(vegan)

## Warning: package 'vegan' was built under R version 4.5.3

## Loading required package: permute

## Warning: package 'permute' was built under R version 4.5.3

library(cluster)
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(tibble)
library(ggdendro)

set.seed(123)

#========================================
# 1. Simulate a long-format data frame
# The last column is Concentration
#========================================
n_dili   <- 30
n_nodili <- 30
p        <- 8
n        <- n_dili + n_nodili

sample_id <- paste0("S", 1:n)
group_vec <- c(rep("DILI", n_dili), rep("No-DILI", n_nodili))
grp <- factor(group_vec, levels = c("DILI", "No-DILI"))

base_mat <- matrix(rnorm(n * p, mean = 0, sd = 0.7), nrow = n, ncol = p)
colnames(base_mat) <- paste0("F", 1:p)
rownames(base_mat) <- sample_id

delta <- c(1.1, 1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4)
conc_levels <- c("Low", "Mid", "High")
effect_map  <- c(Low = 0.3, Mid = 0.8, High = 1.4)

df_list <- lapply(conc_levels, function(cc) {
  eff <- effect_map[cc]
  
  X <- base_mat + matrix(rnorm(n * p, mean = 0, sd = 0.5), nrow = n, ncol = p)
  X[grp == "DILI", ] <- sweep(X[grp == "DILI", ], 2, eff * delta, "+")
  
  data.frame(
    Sample = sample_id,
    Group = grp,
    X,
    Concentration = cc,
    check.names = FALSE
  )
})

df_all <- bind_rows(df_list)
df_all$Concentration <- factor(df_all$Concentration, levels = c("Low", "Mid", "High"))

#========================================
# 2. Apply common scaling
# Use Low concentration as the reference
#========================================
feature_cols <- names(df_all)[3:(ncol(df_all) - 1)]

df_low <- df_all %>% filter(Concentration == "Low")

center_ref <- colMeans(df_low[, feature_cols])
scale_ref  <- apply(df_low[, feature_cols], 2, sd)
scale_ref[scale_ref == 0] <- 1

df_all_scaled <- df_all
df_all_scaled[, feature_cols] <- scale(
  df_all[, feature_cols],
  center = center_ref,
  scale = scale_ref
)

head(df_all_scaled)

##        Sample Group          F1         F2         F3          F4         F5
## S1...1     S1  DILI -0.39742020  0.3893341  1.1118386 -0.14244032 -1.3930019
## S2...2     S2  DILI  0.09653414 -0.4422052 -0.6092131  2.00768462 -1.5580999
## S3...3     S3  DILI  2.39543664  0.3773743 -0.1885306  1.05497285  1.4288079
## S4...4     S4  DILI  0.24564405 -1.2860802 -0.9549367 -0.49287984 -1.9570054
## S5...5     S5  DILI  0.68588627 -0.1295139  2.5278813 -1.25615473 -0.2966851
## S6...6     S6  DILI  2.17953112  0.9886829 -0.4562969  0.07433245  1.5312515
##                 F6            F7          F8 Concentration
## S1...1  0.07855677 -0.5347245949  0.75259189           Low
## S2...2  0.09961022  2.0169168585 -0.30710173           Low
## S3...3 -0.53398116 -0.0007498042  0.18472726           Low
## S4...4 -1.26409704  0.8641514777  1.85514168           Low
## S5...5 -0.87150598  1.0266623660 -0.04092623           Low
## S6...6  0.11050652 -0.1351705618 -0.79777671           Low

#========================================
# 3. Helper function
#========================================
calc_gini_purity <- function(cluster, grp) {
  tab <- table(cluster, grp)
  prop_tab <- prop.table(tab, 1)
  gp_each <- apply(prop_tab, 1, function(x) sum(x^2))
  weighted.mean(gp_each, w = rowSums(tab))
}

#========================================
# 4. Analysis function for one concentration
#========================================
analyze_one_df <- function(df_sub, feature_cols) {
  
  grp_sub <- factor(df_sub$Group, levels = c("DILI", "No-DILI"))
  
  X <- as.matrix(df_sub[, feature_cols])
  rownames(X) <- df_sub$Sample
  
  # Distance matrix
  d <- dist(X, method = "euclidean")
  
  # Hierarchical clustering
  hc <- hclust(d, method = "ward.D2")
  cl <- cutree(hc, k = 2)
  
  # Gini purity
  gini_purity <- calc_gini_purity(cl, grp_sub)
  
  # Label silhouette
  sil_label <- silhouette(as.integer(grp_sub), d)
  label_sil <- mean(sil_label[, 3])
  
  # PERMANOVA
  meta <- data.frame(Group = grp_sub)
  adonis_res <- adonis2(d ~ Group, data = meta, permutations = 999)
  permanova_r2 <- unname(adonis_res$R2[1])
  permanova_p  <- unname(adonis_res$`Pr(>F)`[1])
  
  # PCoA
  pcoa <- cmdscale(d, k = 2, eig = TRUE)
  
  pcoa_df <- tibble(
    Sample = df_sub$Sample,
    Group = grp_sub,
    Concentration = df_sub$Concentration,
    Axis1 = pcoa$points[, 1],
    Axis2 = pcoa$points[, 2]
  )
  
  metrics <- tibble(
    Concentration = as.character(df_sub$Concentration[1]),
    GiniPurity = as.numeric(gini_purity),
    LabelSilhouette = as.numeric(label_sil),
    PERMANOVA_R2 = as.numeric(permanova_r2),
    PERMANOVA_p = as.numeric(permanova_p)
  )
  
  list(
    metrics = metrics,
    pcoa_df = pcoa_df,
    hc = hc
  )
}

#========================================
# 5. Run the analysis by concentration
#========================================
split_list <- split(df_all_scaled, df_all_scaled$Concentration)

analysis_list <- lapply(split_list, function(x) analyze_one_df(x, feature_cols))

metrics_df <- bind_rows(lapply(analysis_list, `[[`, "metrics"))
pcoa_all   <- bind_rows(lapply(analysis_list, `[[`, "pcoa_df"))

metrics_df$Concentration <- factor(metrics_df$Concentration, levels = c("Low", "Mid", "High"))
pcoa_all$Concentration   <- factor(pcoa_all$Concentration, levels = c("Low", "Mid", "High"))

print(metrics_df)

## # A tibble: 3 × 5
##   Concentration GiniPurity LabelSilhouette PERMANOVA_R2 PERMANOVA_p
##   <fct>              <dbl>           <dbl>        <dbl>       <dbl>
## 1 Low                0.502          0.0132       0.0296       0.068
## 2 Mid                0.542          0.0638       0.0887       0.001
## 3 High               0.75           0.243        0.291        0.001

#========================================
# 6. Plot 1: Three metrics across concentrations
#========================================
metrics_long <- metrics_df %>%
  select(Concentration, GiniPurity, LabelSilhouette, PERMANOVA_R2) %>%
  pivot_longer(-Concentration, names_to = "Metric", values_to = "Value")

p1 <- ggplot(metrics_long, aes(x = Concentration, y = Value, group = Metric, color = Metric)) +
  geom_line(linewidth = 1) +
  geom_point(size = 3) +
  theme_classic(base_size = 13) +
  labs(
    title = "Metrics across concentrations",
    x = "Concentration",
    y = "Value"
  )

print(p1)

#========================================
# 7. Plot 2: PCoA
#========================================
p2 <- ggplot(pcoa_all, aes(x = Axis1, y = Axis2, color = Group)) +
  geom_point(size = 2.5, alpha = 0.9) +
  stat_ellipse(type = "norm", linewidth = 0.8) +
  facet_wrap(~ Concentration, scales = "free") +
  theme_classic(base_size = 13) +
  labs(
    title = "PCoA",
    x = "Axis 1",
    y = "Axis 2"
  )

print(p2)

#========================================
# 8. Plot 3: Dendrogram using ggplot2
# Sample labels are colored by Group
#========================================
plot_dendrogram_gg <- function(hc, sub_df, title_text) {
  dend <- as.dendrogram(hc)
  dend_data <- ggdendro::dendro_data(dend, type = "rectangle")
  
  seg_df <- dend_data$segments
  lab_df <- dend_data$labels
  
  lab_df$label <- as.character(lab_df$label)
  sub_df$Sample <- as.character(sub_df$Sample)
  sub_df$Group <- factor(sub_df$Group, levels = c("DILI", "No-DILI"))
  
  lab_df <- lab_df %>%
    left_join(
      sub_df %>% select(Sample, Group),
      by = c("label" = "Sample")
    )
  
  y_offset <- max(seg_df$y) * 0.03
  
  ggplot() +
    geom_segment(
      data = seg_df,
      aes(x = x, y = y, xend = xend, yend = yend),
      linewidth = 0.4
    ) +
    geom_text(
      data = lab_df,
      aes(x = x, y = -y_offset, label = label, color = Group),
      angle = 90,
      hjust = 1,
      size = 2.6
    ) +
    scale_color_manual(
      values = c("DILI" = "tomato", "No-DILI" = "steelblue"),
      drop = FALSE
    ) +
    labs(
      title = title_text,
      x = NULL,
      y = "Height",
      color = NULL
    ) +
    coord_cartesian(ylim = c(-max(seg_df$y) * 0.08, max(seg_df$y))) +
    theme_classic(base_size = 12) +
    theme(
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank(),
      plot.margin = margin(10, 10, 30, 10)
    )
}

p_dend_low  <- plot_dendrogram_gg(analysis_list[["Low"]]$hc,  split_list[["Low"]],  "Dendrogram - Low")
p_dend_mid  <- plot_dendrogram_gg(analysis_list[["Mid"]]$hc,  split_list[["Mid"]],  "Dendrogram - Mid")
p_dend_high <- plot_dendrogram_gg(analysis_list[["High"]]$hc, split_list[["High"]], "Dendrogram - High")

print(p_dend_low)

print(p_dend_mid)

print(p_dend_high)

#========================================
# 9. Interpretation
#========================================
cat("\nInterpretation:\n")

## 
## Interpretation:

cat("- GiniPurity ranges from 0 to 1. Higher values indicate that sample labels are more homogeneous within clusters, whereas lower values indicate greater mixing of DILI and No-DILI samples within clusters.\n")

## - GiniPurity ranges from 0 to 1. Higher values indicate that sample labels are more homogeneous within clusters, whereas lower values indicate greater mixing of DILI and No-DILI samples within clusters.

cat("- LabelSilhouette ranges from -1 to 1. Higher values indicate better separation between DILI and No-DILI samples in the feature space, values near 0 indicate substantial overlap, and negative values indicate poor separation.\n")

## - LabelSilhouette ranges from -1 to 1. Higher values indicate better separation between DILI and No-DILI samples in the feature space, values near 0 indicate substantial overlap, and negative values indicate poor separation.

cat("- PERMANOVA_R2 ranges from 0 to 1. Higher values indicate that the group label explains a larger proportion of the overall multivariate variation. The associated PERMANOVA p-value indicates whether the group difference is statistically significant.\n")

## - PERMANOVA_R2 ranges from 0 to 1. Higher values indicate that the group label explains a larger proportion of the overall multivariate variation. The associated PERMANOVA p-value indicates whether the group difference is statistically significant.

cat("- If GiniPurity, LabelSilhouette, and PERMANOVA_R2 all increase with concentration, this suggests that the clustering structure becomes increasingly consistent with the DILI / No-DILI labels.\n")

## - If GiniPurity, LabelSilhouette, and PERMANOVA_R2 all increase with concentration, this suggests that the clustering structure becomes increasingly consistent with the DILI / No-DILI labels.

distance_clustering-consistency.R

xut2

2026-04-21