library(vegan)
## Warning: package 'vegan' was built under R version 4.5.3
## Loading required package: permute
## Warning: package 'permute' was built under R version 4.5.3
library(cluster)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(tibble)
library(ggdendro)
set.seed(123)
#========================================
# 1. Simulate a long-format data frame
# The last column is Concentration
#========================================
n_dili <- 30
n_nodili <- 30
p <- 8
n <- n_dili + n_nodili
sample_id <- paste0("S", 1:n)
group_vec <- c(rep("DILI", n_dili), rep("No-DILI", n_nodili))
grp <- factor(group_vec, levels = c("DILI", "No-DILI"))
base_mat <- matrix(rnorm(n * p, mean = 0, sd = 0.7), nrow = n, ncol = p)
colnames(base_mat) <- paste0("F", 1:p)
rownames(base_mat) <- sample_id
delta <- c(1.1, 1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4)
conc_levels <- c("Low", "Mid", "High")
effect_map <- c(Low = 0.3, Mid = 0.8, High = 1.4)
df_list <- lapply(conc_levels, function(cc) {
eff <- effect_map[cc]
X <- base_mat + matrix(rnorm(n * p, mean = 0, sd = 0.5), nrow = n, ncol = p)
X[grp == "DILI", ] <- sweep(X[grp == "DILI", ], 2, eff * delta, "+")
data.frame(
Sample = sample_id,
Group = grp,
X,
Concentration = cc,
check.names = FALSE
)
})
df_all <- bind_rows(df_list)
df_all$Concentration <- factor(df_all$Concentration, levels = c("Low", "Mid", "High"))
#========================================
# 2. Apply common scaling
# Use Low concentration as the reference
#========================================
feature_cols <- names(df_all)[3:(ncol(df_all) - 1)]
df_low <- df_all %>% filter(Concentration == "Low")
center_ref <- colMeans(df_low[, feature_cols])
scale_ref <- apply(df_low[, feature_cols], 2, sd)
scale_ref[scale_ref == 0] <- 1
df_all_scaled <- df_all
df_all_scaled[, feature_cols] <- scale(
df_all[, feature_cols],
center = center_ref,
scale = scale_ref
)
head(df_all_scaled)
## Sample Group F1 F2 F3 F4 F5
## S1...1 S1 DILI -0.39742020 0.3893341 1.1118386 -0.14244032 -1.3930019
## S2...2 S2 DILI 0.09653414 -0.4422052 -0.6092131 2.00768462 -1.5580999
## S3...3 S3 DILI 2.39543664 0.3773743 -0.1885306 1.05497285 1.4288079
## S4...4 S4 DILI 0.24564405 -1.2860802 -0.9549367 -0.49287984 -1.9570054
## S5...5 S5 DILI 0.68588627 -0.1295139 2.5278813 -1.25615473 -0.2966851
## S6...6 S6 DILI 2.17953112 0.9886829 -0.4562969 0.07433245 1.5312515
## F6 F7 F8 Concentration
## S1...1 0.07855677 -0.5347245949 0.75259189 Low
## S2...2 0.09961022 2.0169168585 -0.30710173 Low
## S3...3 -0.53398116 -0.0007498042 0.18472726 Low
## S4...4 -1.26409704 0.8641514777 1.85514168 Low
## S5...5 -0.87150598 1.0266623660 -0.04092623 Low
## S6...6 0.11050652 -0.1351705618 -0.79777671 Low
#========================================
# 3. Helper function
#========================================
calc_gini_purity <- function(cluster, grp) {
tab <- table(cluster, grp)
prop_tab <- prop.table(tab, 1)
gp_each <- apply(prop_tab, 1, function(x) sum(x^2))
weighted.mean(gp_each, w = rowSums(tab))
}
#========================================
# 4. Analysis function for one concentration
#========================================
analyze_one_df <- function(df_sub, feature_cols) {
grp_sub <- factor(df_sub$Group, levels = c("DILI", "No-DILI"))
X <- as.matrix(df_sub[, feature_cols])
rownames(X) <- df_sub$Sample
# Distance matrix
d <- dist(X, method = "euclidean")
# Hierarchical clustering
hc <- hclust(d, method = "ward.D2")
cl <- cutree(hc, k = 2)
# Gini purity
gini_purity <- calc_gini_purity(cl, grp_sub)
# Label silhouette
sil_label <- silhouette(as.integer(grp_sub), d)
label_sil <- mean(sil_label[, 3])
# PERMANOVA
meta <- data.frame(Group = grp_sub)
adonis_res <- adonis2(d ~ Group, data = meta, permutations = 999)
permanova_r2 <- unname(adonis_res$R2[1])
permanova_p <- unname(adonis_res$`Pr(>F)`[1])
# PCoA
pcoa <- cmdscale(d, k = 2, eig = TRUE)
pcoa_df <- tibble(
Sample = df_sub$Sample,
Group = grp_sub,
Concentration = df_sub$Concentration,
Axis1 = pcoa$points[, 1],
Axis2 = pcoa$points[, 2]
)
metrics <- tibble(
Concentration = as.character(df_sub$Concentration[1]),
GiniPurity = as.numeric(gini_purity),
LabelSilhouette = as.numeric(label_sil),
PERMANOVA_R2 = as.numeric(permanova_r2),
PERMANOVA_p = as.numeric(permanova_p)
)
list(
metrics = metrics,
pcoa_df = pcoa_df,
hc = hc
)
}
#========================================
# 5. Run the analysis by concentration
#========================================
split_list <- split(df_all_scaled, df_all_scaled$Concentration)
analysis_list <- lapply(split_list, function(x) analyze_one_df(x, feature_cols))
metrics_df <- bind_rows(lapply(analysis_list, `[[`, "metrics"))
pcoa_all <- bind_rows(lapply(analysis_list, `[[`, "pcoa_df"))
metrics_df$Concentration <- factor(metrics_df$Concentration, levels = c("Low", "Mid", "High"))
pcoa_all$Concentration <- factor(pcoa_all$Concentration, levels = c("Low", "Mid", "High"))
print(metrics_df)
## # A tibble: 3 × 5
## Concentration GiniPurity LabelSilhouette PERMANOVA_R2 PERMANOVA_p
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Low 0.502 0.0132 0.0296 0.068
## 2 Mid 0.542 0.0638 0.0887 0.001
## 3 High 0.75 0.243 0.291 0.001
#========================================
# 6. Plot 1: Three metrics across concentrations
#========================================
metrics_long <- metrics_df %>%
select(Concentration, GiniPurity, LabelSilhouette, PERMANOVA_R2) %>%
pivot_longer(-Concentration, names_to = "Metric", values_to = "Value")
p1 <- ggplot(metrics_long, aes(x = Concentration, y = Value, group = Metric, color = Metric)) +
geom_line(linewidth = 1) +
geom_point(size = 3) +
theme_classic(base_size = 13) +
labs(
title = "Metrics across concentrations",
x = "Concentration",
y = "Value"
)
print(p1)

#========================================
# 7. Plot 2: PCoA
#========================================
p2 <- ggplot(pcoa_all, aes(x = Axis1, y = Axis2, color = Group)) +
geom_point(size = 2.5, alpha = 0.9) +
stat_ellipse(type = "norm", linewidth = 0.8) +
facet_wrap(~ Concentration, scales = "free") +
theme_classic(base_size = 13) +
labs(
title = "PCoA",
x = "Axis 1",
y = "Axis 2"
)
print(p2)

#========================================
# 8. Plot 3: Dendrogram using ggplot2
# Sample labels are colored by Group
#========================================
plot_dendrogram_gg <- function(hc, sub_df, title_text) {
dend <- as.dendrogram(hc)
dend_data <- ggdendro::dendro_data(dend, type = "rectangle")
seg_df <- dend_data$segments
lab_df <- dend_data$labels
lab_df$label <- as.character(lab_df$label)
sub_df$Sample <- as.character(sub_df$Sample)
sub_df$Group <- factor(sub_df$Group, levels = c("DILI", "No-DILI"))
lab_df <- lab_df %>%
left_join(
sub_df %>% select(Sample, Group),
by = c("label" = "Sample")
)
y_offset <- max(seg_df$y) * 0.03
ggplot() +
geom_segment(
data = seg_df,
aes(x = x, y = y, xend = xend, yend = yend),
linewidth = 0.4
) +
geom_text(
data = lab_df,
aes(x = x, y = -y_offset, label = label, color = Group),
angle = 90,
hjust = 1,
size = 2.6
) +
scale_color_manual(
values = c("DILI" = "tomato", "No-DILI" = "steelblue"),
drop = FALSE
) +
labs(
title = title_text,
x = NULL,
y = "Height",
color = NULL
) +
coord_cartesian(ylim = c(-max(seg_df$y) * 0.08, max(seg_df$y))) +
theme_classic(base_size = 12) +
theme(
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
plot.margin = margin(10, 10, 30, 10)
)
}
p_dend_low <- plot_dendrogram_gg(analysis_list[["Low"]]$hc, split_list[["Low"]], "Dendrogram - Low")
p_dend_mid <- plot_dendrogram_gg(analysis_list[["Mid"]]$hc, split_list[["Mid"]], "Dendrogram - Mid")
p_dend_high <- plot_dendrogram_gg(analysis_list[["High"]]$hc, split_list[["High"]], "Dendrogram - High")
print(p_dend_low)

print(p_dend_mid)

print(p_dend_high)

#========================================
# 9. Interpretation
#========================================
cat("\nInterpretation:\n")
##
## Interpretation:
cat("- GiniPurity ranges from 0 to 1. Higher values indicate that sample labels are more homogeneous within clusters, whereas lower values indicate greater mixing of DILI and No-DILI samples within clusters.\n")
## - GiniPurity ranges from 0 to 1. Higher values indicate that sample labels are more homogeneous within clusters, whereas lower values indicate greater mixing of DILI and No-DILI samples within clusters.
cat("- LabelSilhouette ranges from -1 to 1. Higher values indicate better separation between DILI and No-DILI samples in the feature space, values near 0 indicate substantial overlap, and negative values indicate poor separation.\n")
## - LabelSilhouette ranges from -1 to 1. Higher values indicate better separation between DILI and No-DILI samples in the feature space, values near 0 indicate substantial overlap, and negative values indicate poor separation.
cat("- PERMANOVA_R2 ranges from 0 to 1. Higher values indicate that the group label explains a larger proportion of the overall multivariate variation. The associated PERMANOVA p-value indicates whether the group difference is statistically significant.\n")
## - PERMANOVA_R2 ranges from 0 to 1. Higher values indicate that the group label explains a larger proportion of the overall multivariate variation. The associated PERMANOVA p-value indicates whether the group difference is statistically significant.
cat("- If GiniPurity, LabelSilhouette, and PERMANOVA_R2 all increase with concentration, this suggests that the clustering structure becomes increasingly consistent with the DILI / No-DILI labels.\n")
## - If GiniPurity, LabelSilhouette, and PERMANOVA_R2 all increase with concentration, this suggests that the clustering structure becomes increasingly consistent with the DILI / No-DILI labels.