load libraries
# Data Processing
library(dplyr)
library(Seurat)
library(tibble)
library(tidyr)
library(stringr)
# Visualization
library(ggplot2)
library(ComplexHeatmap)
library(patchwork)
library(SCpubr)
# Regulatory Network Inference
library(decoupleR)
library(dorothea)
data(dorothea_hs, package = "dorothea")
library(tictoc)
Load Seurat Object
library(Seurat)
library(reticulate)
library(ggplot2)
# 1. Load Object
seurat_obj <- readRDS("temp_seurat_obj.rds")
SUPPLEMENTARY FIGURE:
UMAP Biplot Code (Correlation-Based Approach)
library(Seurat)
library(ggplot2)
library(dplyr)
library(ggrepel)
# 1. Extract UMAP Embeddings
DefaultAssay(seurat_obj) <- "dorothea"
umap_data <- Embeddings(seurat_obj, reduction = "umap")[, 1:2] %>%
as.data.frame() %>%
mutate(Cluster = as.factor(seurat_obj$seurat_clusters))
# 2. Get TF Activity Matrix
tf_matrix <- GetAssayData(seurat_obj, assay = "dorothea", layer = "data")
# 3. Compute TF Correlations with UMAP Axes
tf_cors <- data.frame(
TF = rownames(tf_matrix),
UMAP_1_cor = apply(tf_matrix, 1, function(x) cor(x, umap_data$umap_1)),
UMAP_2_cor = apply(tf_matrix, 1, function(x) cor(x, umap_data$umap_2))
)
# 4. Filter Top Drivers (Top 3 per quadrant)
umap1_top <- tf_cors %>% arrange(desc(UMAP_1_cor)) %>% head(3) %>% pull(TF)
umap1_bottom <- tf_cors %>% arrange(UMAP_1_cor) %>% head(3) %>% pull(TF)
umap2_top <- tf_cors %>% arrange(desc(UMAP_2_cor)) %>% head(3) %>% pull(TF)
umap2_bottom <- tf_cors %>% arrange(UMAP_2_cor) %>% head(3) %>% pull(TF)
top_tf_drivers <- unique(c(umap1_top, umap1_bottom, umap2_top, umap2_bottom))
# 5. Prepare Arrow Data
arrow_data <- tf_cors %>% filter(TF %in% top_tf_drivers)
# Scale arrows to fit nicely on the plot
scale_factor <- max(abs(umap_data$umap_1)) / max(abs(arrow_data$UMAP_1_cor)) * 0.7
arrow_data$UMAP_1 <- arrow_data$UMAP_1_cor * scale_factor
arrow_data$UMAP_2 <- arrow_data$UMAP_2_cor * scale_factor
# 6. Calculate Cluster Centroids for Labels
cluster_centroids <- umap_data %>%
group_by(Cluster) %>%
summarise(
umap_1_center = median(umap_1),
umap_2_center = median(umap_2)
)
# 7. Plot UMAP Biplot with Cluster Labels
p_umap_biplot <- ggplot(umap_data, aes(x = umap_1, y = umap_2)) +
# Points (Cells) - larger and more visible
geom_point(aes(color = Cluster), alpha = 0.7, size = 2) +
# Cluster Labels (at centroids)
geom_text(data = cluster_centroids,
aes(x = umap_1_center, y = umap_2_center, label = Cluster),
fontface = "bold", size = 6, color = "black") +
# Arrows (TFs) - from center of UMAP
geom_segment(data = arrow_data,
aes(x = mean(umap_data$umap_1), y = mean(umap_data$umap_2),
xend = mean(umap_data$umap_1) + UMAP_1,
yend = mean(umap_data$umap_2) + UMAP_2),
arrow = arrow(length = unit(0.25, "cm")),
color = "darkred", linewidth = 1) +
# TF Labels (using ggrepel to avoid overlap with cluster labels)
geom_text_repel(data = arrow_data,
aes(x = mean(umap_data$umap_1) + UMAP_1,
y = mean(umap_data$umap_2) + UMAP_2,
label = TF),
fontface = "bold.italic", color = "darkred", size = 4.5,
box.padding = 0.6, point.padding = 0.4,
min.segment.length = 0.2) +
# Styling
scale_color_manual(values = Seurat::DiscretePalette(14)) +
labs(title = "Regulatory Drivers in UMAP Space",
subtitle = "Cluster numbers shown at centroids; TF arrows indicate regulatory associations",
x = "UMAP_1",
y = "UMAP_2") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14),
legend.position = "right",
panel.grid.minor = element_blank()
)
# Save
ggsave("Output_Figures/Supplementary_TF_UMAP_Biplot.pdf", p_umap_biplot, width = 12, height = 9)
print(p_umap_biplot)

Check marker expression
to validate root choice
library(Seurat)
library(ggplot2)
library(dplyr)
library(pheatmap)
# Make sure SCT is default
DefaultAssay(seurat_obj) <- "SCT"
# SƩzary-specific markers
sezary_markers <- list(
early_memory = c("CCR7", "SELL", "TCF7", "LEF1"),
malignant = c("TOX", "TWIST1", "DNM3", "STAT5A"),
exhausted = c("PDCD1", "CTLA4", "TIGIT", "LAG3"),
proliferative = c("MKI67", "TOP2A", "PCNA")
)
all_markers <- unlist(sezary_markers)
# ============================================
# 1. DOT PLOT - ALL CLUSTERS
# ============================================
p_dot <- DotPlot(seurat_obj,
features = sezary_markers,
assay = "SCT", # Explicitly specify assay
cols = c("lightgrey", "red"),
dot.scale = 6) +
coord_flip() +
ggtitle("SƩzary Markers Across All Clusters") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
axis.text.y = element_text(size = 10)) +
labs(x = "Markers", y = "Clusters")
print(p_dot)
ggsave("Output_Figures/Sezary_Markers_DotPlot_AllClusters.pdf", p_dot, width = 14, height = 8)
# ============================================
# 2. AVERAGE EXPRESSION (FIXED)
# ============================================
# Use AggregateExpression (recommended in Seurat v5) with correct assay
avg_exp <- AggregateExpression(seurat_obj,
features = all_markers,
assays = "SCT", # Specify SCT only
group.by = "seurat_clusters",
slot = "data",
return.seurat = FALSE)
# Extract the SCT matrix
expr_matrix <- as.matrix(avg_exp$SCT)
# Check dimensions
cat("Expression matrix dimensions:", nrow(expr_matrix), "genes x", ncol(expr_matrix), "clusters\n")
Expression matrix dimensions: 15 genes x 14 clusters
print(head(expr_matrix))
g0 g1 g2 g3 g4 g5 g6 g7 g8 g9 g10 g11 g12 g13
CCR7 14443 1144 11455 17619 12865 5533 9463 4391 7776 5016 8035 3217 4113 8053
SELL 1558 145 657 17789 753 3982 277 2210 1287 2874 9643 1256 409 115
TCF7 125 3808 341 19510 52 3758 402 1011 50 2130 8873 368 207 114
LEF1 687 9412 761 14007 169 9625 771 1806 206 4801 5217 962 326 68
TOX 2074 14642 984 132 784 2792 373 1300 2539 1715 491 675 411 157
TWIST1 3668 3259 2324 0 3893 212 1720 1276 2535 559 28 674 466 496
# ============================================
# 3. HEATMAP - Scaled Expression
# ============================================
# Scale by row (gene) for better visualization
expr_matrix_scaled <- t(scale(t(expr_matrix)))
# Annotation for marker categories
annotation_row <- data.frame(
Category = rep(names(sezary_markers), times = sapply(sezary_markers, length)),
row.names = all_markers
)
annotation_colors <- list(
Category = c(early_memory = "#4DAF4A",
malignant = "#E41A1C",
exhausted = "#377EB8",
proliferative = "#FF7F00")
)
pdf("Output_Figures/Sezary_Markers_Heatmap_AllClusters.pdf", width = 12, height = 8)

pheatmap(expr_matrix_scaled,
cluster_rows = FALSE,
cluster_cols = TRUE,
annotation_row = annotation_row,
annotation_colors = annotation_colors,
color = colorRampPalette(c("navy", "white", "firebrick"))(100),
main = "SƩzary Markers - All Clusters (Scaled)",
fontsize = 10,
angle_col = 0)
dev.off()
pdf
3

# ============================================
# 4. CALCULATE CATEGORY SCORES PER CLUSTER
# ============================================
category_scores <- data.frame()
for(category in names(sezary_markers)) {
markers_in_category <- sezary_markers[[category]]
# Calculate mean expression per cluster for this category
category_mean <- colMeans(expr_matrix[markers_in_category, , drop = FALSE])
temp_df <- data.frame(
Cluster = names(category_mean),
Category = category,
Score = as.numeric(category_mean)
)
category_scores <- rbind(category_scores, temp_df)
}
# Reshape for easier analysis
library(tidyr)
category_scores_wide <- category_scores %>%
pivot_wider(names_from = Category, values_from = Score)
# Remove 'g' prefix if present (Seurat adds this)
category_scores_wide$Cluster <- gsub("^g", "", category_scores_wide$Cluster)
cat("\n=== CATEGORY SCORES BY CLUSTER ===\n")
=== CATEGORY SCORES BY CLUSTER ===
print(category_scores_wide)
# A tibble: 14 Ć 5
Cluster early_memory malignant exhausted proliferative
<chr> <dbl> <dbl> <dbl> <dbl>
1 0 4203. 2576. 382. 16194.
2 1 3627. 4727. 101. 21402.
3 2 3304. 1559 3609. 5791.
4 3 17231. 397. 146. 215.
5 4 3460. 1658. 133 10292
6 5 5724. 1007. 173 6241
7 6 2728. 1211. 2705. 8700.
8 7 2354. 1136. 306. 15908.
9 8 2330. 1832. 4309 12281.
10 9 3705. 720. 220 2966
11 10 7942 390. 442. 272.
12 11 1451. 510. 575 1660.
13 12 1264. 608. 388. 2580.
14 13 2088. 354. 265. 1797.
# ============================================
# 5. IDENTIFY CLUSTER PHENOTYPES
# ============================================
cat("\n=== CLUSTER PHENOTYPE CLASSIFICATION ===\n")
=== CLUSTER PHENOTYPE CLASSIFICATION ===
# Rank clusters by each category
rankings <- category_scores_wide %>%
mutate(
early_memory_rank = rank(-early_memory),
malignant_rank = rank(-malignant),
exhausted_rank = rank(-exhausted),
proliferative_rank = rank(-proliferative)
)
# Identify top 3 clusters per category
cat("\nā Top 3 STEM-LIKE clusters (Early Memory):\n")
ā Top 3 STEM-LIKE clusters (Early Memory):
top_memory <- rankings %>% arrange(early_memory_rank) %>% head(3) %>%
select(Cluster, early_memory, early_memory_rank)
print(top_memory)
# A tibble: 3 Ć 3
Cluster early_memory early_memory_rank
<chr> <dbl> <dbl>
1 3 17231. 1
2 10 7942 2
3 5 5724. 3
cat("\nā Top 3 MALIGNANT clusters:\n")
ā Top 3 MALIGNANT clusters:
top_malignant <- rankings %>% arrange(malignant_rank) %>% head(3) %>%
select(Cluster, malignant, malignant_rank)
print(top_malignant)
# A tibble: 3 Ć 3
Cluster malignant malignant_rank
<chr> <dbl> <dbl>
1 1 4727. 1
2 0 2576. 2
3 8 1832. 3
cat("\nā Top 3 EXHAUSTED/TERMINAL clusters:\n")
ā Top 3 EXHAUSTED/TERMINAL clusters:
top_exhausted <- rankings %>% arrange(exhausted_rank) %>% head(3) %>%
select(Cluster, exhausted, exhausted_rank)
print(top_exhausted)
# A tibble: 3 Ć 3
Cluster exhausted exhausted_rank
<chr> <dbl> <dbl>
1 8 4309 1
2 2 3609. 2
3 6 2705. 3
cat("\nā Top 3 PROLIFERATIVE clusters:\n")
ā Top 3 PROLIFERATIVE clusters:
top_prolif <- rankings %>% arrange(proliferative_rank) %>% head(3) %>%
select(Cluster, proliferative, proliferative_rank)
print(top_prolif)
# A tibble: 3 Ć 3
Cluster proliferative proliferative_rank
<chr> <dbl> <dbl>
1 1 21402. 1
2 0 16194. 2
3 7 15908. 3
# ============================================
# 6. BARPLOT - Category Scores
# ============================================
p_bar <- ggplot(category_scores, aes(x = gsub("^g", "", Cluster), y = Score, fill = Category)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_manual(values = c(early_memory = "#4DAF4A",
malignant = "#E41A1C",
exhausted = "#377EB8",
proliferative = "#FF7F00")) +
labs(title = "Functional Signatures Across All Clusters",
x = "Cluster", y = "Mean Expression Score") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(face = "bold", size = 14),
legend.position = "right")
print(p_bar)
ggsave("Output_Figures/Sezary_CategoryScores_Barplot.pdf", p_bar, width = 14, height = 6)
# ============================================
# 7. TRAJECTORY ROOT RECOMMENDATION
# ============================================
cat("\n=== TRAJECTORY INFERENCE RECOMMENDATION ===\n")
=== TRAJECTORY INFERENCE RECOMMENDATION ===
trajectory_guide <- category_scores_wide %>%
mutate(
Stemness_Score = early_memory,
Differentiation_Score = (exhausted + malignant) / 2,
Balance = Stemness_Score - Differentiation_Score
) %>%
arrange(desc(Balance))
cat("\nClusters ranked by Stemness Score:\n")
Clusters ranked by Stemness Score:
print(trajectory_guide %>% select(Cluster, Stemness_Score, Differentiation_Score, Balance))
# A tibble: 14 Ć 4
Cluster Stemness_Score Differentiation_Score Balance
<chr> <dbl> <dbl> <dbl>
1 3 17231. 271. 16960
2 10 7942 416 7526
3 5 5724. 590. 5135.
4 9 3705. 470. 3236.
5 0 4203. 1479. 2724.
6 4 3460. 895. 2564.
7 13 2088. 310. 1778.
8 7 2354. 721. 1634.
9 1 3627. 2414 1213.
10 11 1451. 543. 908.
11 6 2728. 1958. 770
12 12 1264. 498. 766.
13 2 3304. 2584. 720.
14 8 2330. 3070. -740.
cat("\nāāā RECOMMENDED ROOT CLUSTER: ",
trajectory_guide$Cluster[1],
" (highest stemness, lowest differentiation)\n\n", sep = "")
āāā RECOMMENDED ROOT CLUSTER: 3 (highest stemness, lowest differentiation)
cat("ā POTENTIAL TERMINAL CLUSTERS: ",
paste(tail(trajectory_guide$Cluster, 3), collapse = ", "),
" (high exhaustion/malignancy)\n\n", sep = "")
ā POTENTIAL TERMINAL CLUSTERS: 12, 2, 8 (high exhaustion/malignancy)
# Save summary tables
write.csv(category_scores_wide, "Output_Figures/Cluster_Phenotype_Scores.csv", row.names = FALSE)
write.csv(trajectory_guide, "Output_Figures/Trajectory_Root_Recommendations.csv", row.names = FALSE)
cat("ā Analysis complete! Tables saved to Output_Figures/\n")
ā Analysis complete! Tables saved to Output_Figures/
ROBUST Entropy Analysis
(No External Dependencies)
library(Seurat)
library(ggplot2)
library(dplyr)
library(patchwork)
DefaultAssay(seurat_obj) <- "SCT"
cat("Computing entropy-based stemness metrics...\n")
Computing entropy-based stemness metrics...
# ============================================
# Function: Shannon Entropy Calculation
# ============================================
calculate_shannon_entropy <- function(expression_vector) {
expr_nonzero <- expression_vector[expression_vector > 0]
if(length(expr_nonzero) == 0) return(0)
probs <- expr_nonzero / sum(expr_nonzero)
entropy <- -sum(probs * log2(probs))
return(entropy)
}
# ============================================
# Function: Gini Coefficient (Specialization)
# ============================================
calculate_gini <- function(x) {
x <- x[x > 0]
if(length(x) == 0) return(0)
n <- length(x)
x_sorted <- sort(x)
gini <- (2 * sum((1:n) * x_sorted)) / (n * sum(x_sorted)) - (n + 1) / n
return(gini)
}
# ============================================
# STEP 1: Calculate Entropy Per Cell
# ============================================
expr_matrix <- GetAssayData(seurat_obj, assay = "SCT", layer = "data")
hvgs <- VariableFeatures(seurat_obj)
expr_hvg <- expr_matrix[hvgs, ]
cat("Calculating entropy for", ncol(expr_hvg), "cells using", length(hvgs), "HVGs...\n")
Calculating entropy for 49305 cells using 3000 HVGs...
# Shannon Entropy
entropy_scores <- apply(expr_hvg, 2, calculate_shannon_entropy)
seurat_obj$Shannon_Entropy <- entropy_scores
# Gini Index
gini_scores <- apply(expr_hvg, 2, calculate_gini)
seurat_obj$Gini_Index <- gini_scores
# Transcriptional Diversity Score
seurat_obj$Diversity_Score <- seurat_obj$nFeature_RNA / max(seurat_obj$nFeature_RNA)
# ============================================
# STEP 2: Combined Stemness Score
# ============================================
# Normalize and combine metrics
seurat_obj$Stemness_Score <-
scales::rescale(seurat_obj$Shannon_Entropy, to = c(0, 1)) * 0.4 +
(1 - scales::rescale(seurat_obj$Gini_Index, to = c(0, 1))) * 0.3 + # Invert Gini
seurat_obj$Diversity_Score * 0.3
# ============================================
# STEP 3: Per-Cluster Analysis
# ============================================
cluster_potency <- seurat_obj@meta.data %>%
group_by(seurat_clusters) %>%
summarize(
n_cells = n(),
mean_entropy = mean(Shannon_Entropy),
mean_gini = mean(Gini_Index),
mean_diversity = mean(Diversity_Score),
mean_stemness = mean(Stemness_Score),
sd_stemness = sd(Stemness_Score)
) %>%
arrange(desc(mean_stemness))
cat("\n=== STEMNESS/POTENCY SCORES BY CLUSTER ===\n")
=== STEMNESS/POTENCY SCORES BY CLUSTER ===
print(cluster_potency)
# A tibble: 14 Ć 7
seurat_clusters n_cells mean_entropy mean_gini mean_diversity mean_stemness sd_stemness
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 8 3338 9.92 0.318 0.654 0.587 0.0610
2 12 1063 9.74 0.317 0.586 0.536 0.0958
3 1 5275 9.75 0.324 0.609 0.535 0.0642
4 13 691 9.70 0.319 0.524 0.508 0.0750
5 6 3536 9.67 0.316 0.498 0.498 0.0637
6 0 6789 9.66 0.323 0.529 0.496 0.0651
7 7 3409 9.67 0.312 0.472 0.495 0.0693
8 4 4086 9.66 0.328 0.540 0.494 0.0668
9 5 3634 9.59 0.311 0.480 0.485 0.0824
10 2 4663 9.56 0.323 0.467 0.459 0.0709
11 11 1675 9.55 0.306 0.352 0.446 0.0692
12 9 3273 9.33 0.332 0.356 0.375 0.0544
13 10 3212 9.15 0.291 0.281 0.374 0.0785
14 3 4661 8.98 0.304 0.247 0.319 0.0669
# ============================================
# VISUALIZATION 1: Stemness Score Barplot
# ============================================
p1 <- ggplot(cluster_potency,
aes(x = reorder(seurat_clusters, -mean_stemness),
y = mean_stemness,
fill = mean_stemness)) +
geom_bar(stat = "identity") +
geom_errorbar(aes(ymin = mean_stemness - sd_stemness,
ymax = mean_stemness + sd_stemness),
width = 0.3, alpha = 0.5) +
scale_fill_gradient(low = "blue", high = "red") +
labs(title = "Entropy-Based Stemness Score by Cluster",
subtitle = "Combined metric: Shannon entropy + Gini + Diversity",
x = "Cluster", y = "Mean Stemness Score",
fill = "Stemness") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 11),
plot.title = element_text(face = "bold", size = 14))
print(p1)

# ============================================
# VISUALIZATION 2: Stemness on UMAP
# ============================================
p2 <- FeaturePlot(seurat_obj,
features = "Stemness_Score",
reduction = "umap",
pt.size = 1.5) +
scale_color_gradient(low = "blue", high = "red") +
labs(title = "Stemness Score in UMAP Space",
subtitle = "Red = High stemness/potency")
print(p2)

# ============================================
# VISUALIZATION 3: Component Scores Heatmap
# ============================================
library(reshape2)
heatmap_data <- cluster_potency %>%
select(seurat_clusters, mean_entropy, mean_gini, mean_diversity) %>%
mutate(mean_gini = 1 - mean_gini) %>% # Invert for consistency
mutate(across(c(mean_entropy, mean_gini, mean_diversity),
~scales::rescale(.x, to = c(0, 1)))) %>%
melt(id.vars = "seurat_clusters")
p3 <- ggplot(heatmap_data, aes(x = variable, y = seurat_clusters, fill = value)) +
geom_tile(color = "white") +
geom_text(aes(label = round(value, 2)), color = "black", size = 3.5) +
scale_fill_gradient2(low = "blue", mid = "white", high = "red",
midpoint = 0.5) +
labs(title = "Component Metrics of Stemness",
x = "Metric", y = "Cluster", fill = "Normalized\nScore") +
scale_x_discrete(labels = c("Entropy", "Specialization", "Diversity")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p3)

# ============================================
# VISUALIZATION 4: Violin Plot
# ============================================
p4 <- VlnPlot(seurat_obj,
features = "Stemness_Score",
pt.size = 0) +
labs(title = "Stemness Score Distribution",
x = "Cluster", y = "Stemness Score") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p4)

# ============================================
# INTEGRATE WITH SĆZARY MARKERS
# ============================================
sezary_markers <- list(
early_memory = c("CCR7", "SELL", "TCF7", "LEF1"),
exhausted = c("PDCD1", "CTLA4", "TIGIT", "LAG3")
)
seurat_obj <- AddModuleScore(seurat_obj,
features = sezary_markers,
name = c("EarlyMemory", "Exhausted"))
cor_memory <- cor(seurat_obj$Stemness_Score, seurat_obj$EarlyMemory1, use = "complete.obs")
cor_exhausted <- cor(seurat_obj$Stemness_Score, seurat_obj$Exhausted2, use = "complete.obs")
cat("\n=== VALIDATION AGAINST MARKER GENES ===\n")
=== VALIDATION AGAINST MARKER GENES ===
cat("Correlation Stemness vs Early Memory:", round(cor_memory, 3), "\n")
Correlation Stemness vs Early Memory: -0.583
cat("Correlation Stemness vs Exhaustion:", round(cor_exhausted, 3), "\n")
Correlation Stemness vs Exhaustion: 0.063
p5 <- ggplot(seurat_obj@meta.data,
aes(x = Stemness_Score, y = EarlyMemory1, color = seurat_clusters)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", color = "black") +
labs(title = "Stemness Score vs Early Memory Markers",
subtitle = paste0("r = ", round(cor_memory, 3)),
x = "Stemness Score", y = "Early Memory Score") +
theme_minimal()
print(p5)

# ============================================
# ROOT RECOMMENDATION
# ============================================
cat("\n=== TRAJECTORY ROOT RECOMMENDATION ===\n\n")
=== TRAJECTORY ROOT RECOMMENDATION ===
cat("āāā HIGHEST STEMNESS/POTENCY CLUSTER: ", cluster_potency$seurat_clusters, "\n")[1]
āāā HIGHEST STEMNESS/POTENCY CLUSTER: 9 13 2 14 7 1 8 5 6 3 12 10 11 4
NULL
cat(" Stemness Score: ", round(cluster_potency$mean_stemness, 3), "\n\n")[1]
Stemness Score: 0.587 0.536 0.535 0.508 0.498 0.496 0.495 0.494 0.485 0.459 0.446 0.375 0.374 0.319
NULL
cat("ā Top 3 stem-like clusters:\n")
ā Top 3 stem-like clusters:
print(cluster_potency %>% head(3) %>% select(seurat_clusters, mean_stemness))
# A tibble: 3 Ć 2
seurat_clusters mean_stemness
<fct> <dbl>
1 8 0.587
2 12 0.536
3 1 0.535
cat("\nā Top 3 differentiated clusters:\n")
ā Top 3 differentiated clusters:
print(cluster_potency %>% tail(3) %>% select(seurat_clusters, mean_stemness))
# A tibble: 3 Ć 2
seurat_clusters mean_stemness
<fct> <dbl>
1 9 0.375
2 10 0.374
3 3 0.319
# ============================================
# SAVE RESULTS
# ============================================
write.csv(cluster_potency,
"Output_Figures/Stemness_Potency_ByCluster.csv",
row.names = FALSE)
# Combined figure
p_combined <- (p1 | p2) / (p3 | p4) +
plot_annotation(title = "Entropy-Based Stemness Analysis",
subtitle = "Root-free differentiation state assessment",
theme = theme(plot.title = element_text(size = 16, face = "bold")))
print(p_combined)

ggsave("Output_Figures/Stemness_Analysis_Complete.pdf", p_combined,
width = 16, height = 12)
cat("\nā Analysis complete! All results saved to Output_Figures/\n")
ā Analysis complete! All results saved to Output_Figures/
---
title: "PAGA using TF Activity"
author: "Nasir Mahmood Abbasi"
date: "`r Sys.Date()`"
output:
  html_notebook:
    number_sections: true
    toc: true
    toc_float:
      collapsed: true
    theme: journal
---


# load libraries
```{r setup, include=TRUE}
# Data Processing
library(dplyr)
library(Seurat)
library(tibble)
library(tidyr)
library(stringr)

# Visualization
library(ggplot2)
library(ComplexHeatmap)
library(patchwork)
library(SCpubr)

# Regulatory Network Inference
library(decoupleR)
library(dorothea)
data(dorothea_hs, package = "dorothea")
library(tictoc)


```

# Load Seurat Object 
```{r}
library(Seurat)
library(reticulate)
library(ggplot2)

# 1. Load Object
seurat_obj <- readRDS("temp_seurat_obj.rds")



```

# SUPPLEMENTARY FIGURE: PAGA Trajectory Analysis (via Reticulate/Scanpy)
```{r, fig.height=16, fig.width=16}
library(Seurat)
library(reticulate)
library(ggplot2)

# 1. FORCE USE of the specific python binary where scanpy is installed
# This overrides the default "auto" discovery which is failing you
Sys.setenv(RETICULATE_PYTHON = "/home/bioinfo/.virtualenvs/r-reticulate/bin/python")
use_python("/home/bioinfo/.virtualenvs/r-reticulate/bin/python", required = TRUE)

# 2. Verify it's working (Optional but good for debugging)
cat("Using Python:", py_config()$python, "\n")

# 3. Import Libraries
sc <- import("scanpy")
ad <- import("anndata")

cat("✓ Python libraries imported successfully!\n")

# 4. Prepare Data for PAGA
cat("Converting Seurat object to AnnData...\n")
tf_matrix <- t(GetAssayData(seurat_obj, assay = "dorothea", layer = "data"))
obs <- data.frame(row.names = colnames(seurat_obj), 
                  louvain = as.character(seurat_obj$seurat_clusters))

adata <- ad$AnnData(X = tf_matrix, obs = obs)

# 5. Run Scanpy Workflow
cat("Running PAGA...\n")
sc$pp$neighbors(adata, use_rep = "X", n_neighbors = 15L)
sc$tl$paga(adata, groups = "louvain")

# 6. Plot and Save
output_file <- "Output_Figures/Supplementary_PAGA_TF_Activity.png"
if(!dir.exists("Output_Figures")) dir.create("Output_Figures")

# CORRECT ORDER: Draw PAGA first to compute positions
sc$pl$paga(adata, threshold = 0.15, show = FALSE)

# Now UMAP can use those positions
sc$tl$umap(adata, init_pos = "paga")

# Plot PAGA with connectivity threshold
sc$pl$paga(adata, threshold = 0.15, show = FALSE, save = "_TF_Activity.png")

# Scanpy saves to "figures/" by default
if(file.exists("figures/paga_TF_Activity.png")){
  file.rename("figures/paga_TF_Activity.png", output_file)
  cat(sprintf("✓ PAGA plot saved to: %s\n", output_file))
} else {
  cat("Note: Check './figures/' directory for output\n")
}

# Display in notebook
if(file.exists(output_file)){
  knitr::include_graphics(output_file)
}

```

# SUPPLEMENTARY FIGURE: PAGA on Gene Expression (SCT Assay)
```{r, fig.height=16, fig.width=16}
library(Seurat)
library(reticulate)

# Use same python environment
Sys.setenv(RETICULATE_PYTHON = "/home/bioinfo/.virtualenvs/r-reticulate/bin/python")
use_python("/home/bioinfo/.virtualenvs/r-reticulate/bin/python", required = TRUE)

sc <- import("scanpy")
ad <- import("anndata")

cat("✓ Python libraries imported\n")

# 1. Extract SCT-normalized gene expression
DefaultAssay(seurat_obj) <- "SCT"

# Use highly variable features from SCT
hvgs <- VariableFeatures(seurat_obj)

# Get normalized data (log1p of corrected counts)
gene_matrix <- t(GetAssayData(seurat_obj, assay = "SCT", layer = "data")[hvgs, ])

cat(sprintf("Using %d highly variable genes from SCT assay\n", length(hvgs)))

# 2. Prepare AnnData object
obs <- data.frame(
  row.names = colnames(seurat_obj),
  louvain = as.character(seurat_obj$seurat_clusters)
)

adata_sct <- ad$AnnData(X = gene_matrix, obs = obs)

# 3. Run PAGA workflow
cat("Computing neighbors and PAGA on SCT-normalized expression...\n")
sc$pp$neighbors(adata_sct, use_rep = "X", n_neighbors = 15L)
sc$tl$paga(adata_sct, groups = "louvain")

# 4. Plot and save
output_file_sct <- "Output_Figures/Supplementary_PAGA_GeneExpression_SCT.png"

# Draw PAGA first to compute positions
sc$pl$paga(adata_sct, threshold = 0.15, show = FALSE)

# Compute UMAP initialized by PAGA
sc$tl$umap(adata_sct, init_pos = "paga")

# Final PAGA plot
sc$pl$paga(adata_sct, threshold = 0.15, show = FALSE, save = "_GeneExpression_SCT.png")

# Move from figures/ to Output_Figures/
if(file.exists("figures/paga_GeneExpression_SCT.png")){
  file.rename("figures/paga_GeneExpression_SCT.png", output_file_sct)
  cat(sprintf("✓ SCT Gene Expression PAGA saved to: %s\n", output_file_sct))
}

# Display
if(file.exists(output_file_sct)){
  knitr::include_graphics(output_file_sct)
}
```
# SUPPLEMENTARY FIGURE: Quick Code to Generate Malignant-Only PAGA
```{r paga_analysis_SCT, fig.height=16, fig.width=16}
# Run PAGA on malignant cells only
library(reticulate)
library(Seurat)

# 1. Subset to malignant cells
malignant_clusters <- setdiff(unique(seurat_obj$seurat_clusters), c(3, 10))
seurat_malignant <- subset(seurat_obj, subset = seurat_clusters %in% malignant_clusters)

# 2. Prepare for PAGA
sc <- import("scanpy")
ad <- import("anndata")

# Get expression matrix and metadata (malignant only)
expr_matrix <- t(GetAssayData(seurat_malignant, assay = "SCT", layer = "data"))
obs <- data.frame(
  row.names = colnames(seurat_malignant),
  louvain = as.character(seurat_malignant$seurat_clusters)
)

adata_malignant <- ad$AnnData(X = expr_matrix, obs = obs)

# 3. Run PAGA
sc$pp$neighbors(adata_malignant, use_rep = "X", n_neighbors = 15L)
sc$tl$paga(adata_malignant, groups = "louvain")

# 4. Plot
sc$pl$paga(adata_malignant, threshold = 0.15, show = FALSE)

# Compute UMAP initialized by PAGA
sc$tl$umap(adata_malignant, init_pos = "paga")

# Save figure
plt <- import("matplotlib.pyplot")
sc$pl$paga(adata_malignant, threshold = 0.15, show = FALSE)  # Draw the plot
plt$savefig("Output_Figures/PAGA_Malignant_Only.png", dpi = 300L, bbox_inches = "tight")
plt$close()

cat("✓ Malignant-only PAGA saved to Output_Figures/PAGA_Malignant_Only.png\n")


output_file <- "Output_Figures/PAGA_Malignant_Only.png"

if (file.exists(output_file)) {
  knitr::include_graphics(output_file)
}

```

# SUPPLEMENTARY FIGURE: UMAP Biplot Code (Correlation-Based Approach)
```{r , fig.height=16, fig.width=16}
library(Seurat)
library(ggplot2)
library(dplyr)
library(ggrepel)

# 1. Extract UMAP Embeddings
DefaultAssay(seurat_obj) <- "dorothea"
umap_data <- Embeddings(seurat_obj, reduction = "umap")[, 1:2] %>% 
  as.data.frame() %>% 
  mutate(Cluster = as.factor(seurat_obj$seurat_clusters))

# 2. Get TF Activity Matrix
tf_matrix <- GetAssayData(seurat_obj, assay = "dorothea", layer = "data")

# 3. Compute TF Correlations with UMAP Axes
tf_cors <- data.frame(
  TF = rownames(tf_matrix),
  UMAP_1_cor = apply(tf_matrix, 1, function(x) cor(x, umap_data$umap_1)),
  UMAP_2_cor = apply(tf_matrix, 1, function(x) cor(x, umap_data$umap_2))
)

# 4. Filter Top Drivers (Top 3 per quadrant)
umap1_top <- tf_cors %>% arrange(desc(UMAP_1_cor)) %>% head(3) %>% pull(TF)
umap1_bottom <- tf_cors %>% arrange(UMAP_1_cor) %>% head(3) %>% pull(TF)
umap2_top <- tf_cors %>% arrange(desc(UMAP_2_cor)) %>% head(3) %>% pull(TF)
umap2_bottom <- tf_cors %>% arrange(UMAP_2_cor) %>% head(3) %>% pull(TF)

top_tf_drivers <- unique(c(umap1_top, umap1_bottom, umap2_top, umap2_bottom))

# 5. Prepare Arrow Data
arrow_data <- tf_cors %>% filter(TF %in% top_tf_drivers)

# Scale arrows to fit nicely on the plot
scale_factor <- max(abs(umap_data$umap_1)) / max(abs(arrow_data$UMAP_1_cor)) * 0.7
arrow_data$UMAP_1 <- arrow_data$UMAP_1_cor * scale_factor
arrow_data$UMAP_2 <- arrow_data$UMAP_2_cor * scale_factor

# 6. Calculate Cluster Centroids for Labels
cluster_centroids <- umap_data %>%
  group_by(Cluster) %>%
  summarise(
    umap_1_center = median(umap_1),
    umap_2_center = median(umap_2)
  )

# 7. Plot UMAP Biplot with Cluster Labels
p_umap_biplot <- ggplot(umap_data, aes(x = umap_1, y = umap_2)) +
  # Points (Cells) - larger and more visible
  geom_point(aes(color = Cluster), alpha = 0.7, size = 2) +
  
  # Cluster Labels (at centroids)
  geom_text(data = cluster_centroids, 
            aes(x = umap_1_center, y = umap_2_center, label = Cluster),
            fontface = "bold", size = 6, color = "black") +
  
  # Arrows (TFs) - from center of UMAP
  geom_segment(data = arrow_data, 
               aes(x = mean(umap_data$umap_1), y = mean(umap_data$umap_2), 
                   xend = mean(umap_data$umap_1) + UMAP_1, 
                   yend = mean(umap_data$umap_2) + UMAP_2),
               arrow = arrow(length = unit(0.25, "cm")), 
               color = "darkred", linewidth = 1) +
  
  # TF Labels (using ggrepel to avoid overlap with cluster labels)
  geom_text_repel(data = arrow_data, 
                  aes(x = mean(umap_data$umap_1) + UMAP_1, 
                      y = mean(umap_data$umap_2) + UMAP_2, 
                      label = TF),
                  fontface = "bold.italic", color = "darkred", size = 4.5, 
                  box.padding = 0.6, point.padding = 0.4,
                  min.segment.length = 0.2) +
  
  # Styling
  scale_color_manual(values = Seurat::DiscretePalette(14)) +
  labs(title = "Regulatory Drivers in UMAP Space",
       subtitle = "Cluster numbers shown at centroids; TF arrows indicate regulatory associations",
       x = "UMAP_1", 
       y = "UMAP_2") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "right",
    panel.grid.minor = element_blank()
  )

# Save
ggsave("Output_Figures/Supplementary_TF_UMAP_Biplot.pdf", p_umap_biplot, width = 12, height = 9)
print(p_umap_biplot)
```



# Check marker expression to validate root choice
```{r , fig.height=16, fig.width=16}
library(Seurat)
library(ggplot2)
library(dplyr)
library(pheatmap)

# Make sure SCT is default
DefaultAssay(seurat_obj) <- "SCT"

# Sézary-specific markers
sezary_markers <- list(
  early_memory = c("CCR7", "SELL", "TCF7", "LEF1"),
  malignant = c("TOX", "TWIST1", "DNM3", "STAT5A"),
  exhausted = c("PDCD1", "CTLA4", "TIGIT", "LAG3"),
  proliferative = c("MKI67", "TOP2A", "PCNA")
)

all_markers <- unlist(sezary_markers)

# ============================================
# 1. DOT PLOT - ALL CLUSTERS
# ============================================
p_dot <- DotPlot(seurat_obj, 
                 features = sezary_markers,
                 assay = "SCT",  # Explicitly specify assay
                 cols = c("lightgrey", "red"),
                 dot.scale = 6) +
  coord_flip() +
  ggtitle("Sézary Markers Across All Clusters") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
        axis.text.y = element_text(size = 10)) +
  labs(x = "Markers", y = "Clusters")

print(p_dot)
ggsave("Output_Figures/Sezary_Markers_DotPlot_AllClusters.pdf", p_dot, width = 14, height = 8)

# ============================================
# 2. AVERAGE EXPRESSION (FIXED)
# ============================================
# Use AggregateExpression (recommended in Seurat v5) with correct assay
avg_exp <- AggregateExpression(seurat_obj, 
                                features = all_markers,
                                assays = "SCT",  # Specify SCT only
                                group.by = "seurat_clusters",
                                slot = "data",
                                return.seurat = FALSE)

# Extract the SCT matrix
expr_matrix <- as.matrix(avg_exp$SCT)

# Check dimensions
cat("Expression matrix dimensions:", nrow(expr_matrix), "genes x", ncol(expr_matrix), "clusters\n")
print(head(expr_matrix))

# ============================================
# 3. HEATMAP - Scaled Expression
# ============================================
# Scale by row (gene) for better visualization
expr_matrix_scaled <- t(scale(t(expr_matrix)))

# Annotation for marker categories
annotation_row <- data.frame(
  Category = rep(names(sezary_markers), times = sapply(sezary_markers, length)),
  row.names = all_markers
)

annotation_colors <- list(
  Category = c(early_memory = "#4DAF4A", 
               malignant = "#E41A1C",
               exhausted = "#377EB8",
               proliferative = "#FF7F00")
)

pdf("Output_Figures/Sezary_Markers_Heatmap_AllClusters.pdf", width = 12, height = 8)
pheatmap(expr_matrix_scaled,
         cluster_rows = FALSE,
         cluster_cols = TRUE,
         annotation_row = annotation_row,
         annotation_colors = annotation_colors,
         color = colorRampPalette(c("navy", "white", "firebrick"))(100),
         main = "Sézary Markers - All Clusters (Scaled)",
         fontsize = 10,
         angle_col = 0)
dev.off()

# ============================================
# 4. CALCULATE CATEGORY SCORES PER CLUSTER
# ============================================
category_scores <- data.frame()

for(category in names(sezary_markers)) {
  markers_in_category <- sezary_markers[[category]]
  
  # Calculate mean expression per cluster for this category
  category_mean <- colMeans(expr_matrix[markers_in_category, , drop = FALSE])
  
  temp_df <- data.frame(
    Cluster = names(category_mean),
    Category = category,
    Score = as.numeric(category_mean)
  )
  
  category_scores <- rbind(category_scores, temp_df)
}

# Reshape for easier analysis
library(tidyr)
category_scores_wide <- category_scores %>%
  pivot_wider(names_from = Category, values_from = Score)

# Remove 'g' prefix if present (Seurat adds this)
category_scores_wide$Cluster <- gsub("^g", "", category_scores_wide$Cluster)

cat("\n=== CATEGORY SCORES BY CLUSTER ===\n")
print(category_scores_wide)

# ============================================
# 5. IDENTIFY CLUSTER PHENOTYPES
# ============================================
cat("\n=== CLUSTER PHENOTYPE CLASSIFICATION ===\n")

# Rank clusters by each category
rankings <- category_scores_wide %>%
  mutate(
    early_memory_rank = rank(-early_memory),
    malignant_rank = rank(-malignant),
    exhausted_rank = rank(-exhausted),
    proliferative_rank = rank(-proliferative)
  )

# Identify top 3 clusters per category
cat("\n✓ Top 3 STEM-LIKE clusters (Early Memory):\n")
top_memory <- rankings %>% arrange(early_memory_rank) %>% head(3) %>% 
  select(Cluster, early_memory, early_memory_rank)
print(top_memory)

cat("\n✓ Top 3 MALIGNANT clusters:\n")
top_malignant <- rankings %>% arrange(malignant_rank) %>% head(3) %>% 
  select(Cluster, malignant, malignant_rank)
print(top_malignant)

cat("\n✓ Top 3 EXHAUSTED/TERMINAL clusters:\n")
top_exhausted <- rankings %>% arrange(exhausted_rank) %>% head(3) %>% 
  select(Cluster, exhausted, exhausted_rank)
print(top_exhausted)

cat("\n✓ Top 3 PROLIFERATIVE clusters:\n")
top_prolif <- rankings %>% arrange(proliferative_rank) %>% head(3) %>% 
  select(Cluster, proliferative, proliferative_rank)
print(top_prolif)

# ============================================
# 6. BARPLOT - Category Scores
# ============================================
p_bar <- ggplot(category_scores, aes(x = gsub("^g", "", Cluster), y = Score, fill = Category)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = c(early_memory = "#4DAF4A", 
                                malignant = "#E41A1C",
                                exhausted = "#377EB8",
                                proliferative = "#FF7F00")) +
  labs(title = "Functional Signatures Across All Clusters",
       x = "Cluster", y = "Mean Expression Score") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(face = "bold", size = 14),
        legend.position = "right")

print(p_bar)
ggsave("Output_Figures/Sezary_CategoryScores_Barplot.pdf", p_bar, width = 14, height = 6)

# ============================================
# 7. TRAJECTORY ROOT RECOMMENDATION
# ============================================
cat("\n=== TRAJECTORY INFERENCE RECOMMENDATION ===\n")

trajectory_guide <- category_scores_wide %>%
  mutate(
    Stemness_Score = early_memory,
    Differentiation_Score = (exhausted + malignant) / 2,
    Balance = Stemness_Score - Differentiation_Score
  ) %>%
  arrange(desc(Balance))

cat("\nClusters ranked by Stemness Score:\n")
print(trajectory_guide %>% select(Cluster, Stemness_Score, Differentiation_Score, Balance))

cat("\n✓✓✓ RECOMMENDED ROOT CLUSTER: ", 
    trajectory_guide$Cluster[1], 
    " (highest stemness, lowest differentiation)\n\n", sep = "")

cat("✓ POTENTIAL TERMINAL CLUSTERS: ", 
    paste(tail(trajectory_guide$Cluster, 3), collapse = ", "),
    " (high exhaustion/malignancy)\n\n", sep = "")

# Save summary tables
write.csv(category_scores_wide, "Output_Figures/Cluster_Phenotype_Scores.csv", row.names = FALSE)
write.csv(trajectory_guide, "Output_Figures/Trajectory_Root_Recommendations.csv", row.names = FALSE)

cat("✓ Analysis complete! Tables saved to Output_Figures/\n")


```


# ROBUST Entropy Analysis (No External Dependencies)
```{r , fig.height=16, fig.width=16}
library(Seurat)
library(ggplot2)
library(dplyr)
library(patchwork)

DefaultAssay(seurat_obj) <- "SCT"

cat("Computing entropy-based stemness metrics...\n")

# ============================================
# Function: Shannon Entropy Calculation
# ============================================
calculate_shannon_entropy <- function(expression_vector) {
  expr_nonzero <- expression_vector[expression_vector > 0]
  if(length(expr_nonzero) == 0) return(0)
  
  probs <- expr_nonzero / sum(expr_nonzero)
  entropy <- -sum(probs * log2(probs))
  return(entropy)
}

# ============================================
# Function: Gini Coefficient (Specialization)
# ============================================
calculate_gini <- function(x) {
  x <- x[x > 0]
  if(length(x) == 0) return(0)
  
  n <- length(x)
  x_sorted <- sort(x)
  gini <- (2 * sum((1:n) * x_sorted)) / (n * sum(x_sorted)) - (n + 1) / n
  return(gini)
}

# ============================================
# STEP 1: Calculate Entropy Per Cell
# ============================================
expr_matrix <- GetAssayData(seurat_obj, assay = "SCT", layer = "data")
hvgs <- VariableFeatures(seurat_obj)
expr_hvg <- expr_matrix[hvgs, ]

cat("Calculating entropy for", ncol(expr_hvg), "cells using", length(hvgs), "HVGs...\n")

# Shannon Entropy
entropy_scores <- apply(expr_hvg, 2, calculate_shannon_entropy)
seurat_obj$Shannon_Entropy <- entropy_scores

# Gini Index
gini_scores <- apply(expr_hvg, 2, calculate_gini)
seurat_obj$Gini_Index <- gini_scores

# Transcriptional Diversity Score
seurat_obj$Diversity_Score <- seurat_obj$nFeature_RNA / max(seurat_obj$nFeature_RNA)

# ============================================
# STEP 2: Combined Stemness Score
# ============================================
# Normalize and combine metrics
seurat_obj$Stemness_Score <- 
  scales::rescale(seurat_obj$Shannon_Entropy, to = c(0, 1)) * 0.4 +
  (1 - scales::rescale(seurat_obj$Gini_Index, to = c(0, 1))) * 0.3 +  # Invert Gini
  seurat_obj$Diversity_Score * 0.3

# ============================================
# STEP 3: Per-Cluster Analysis
# ============================================
cluster_potency <- seurat_obj@meta.data %>%
  group_by(seurat_clusters) %>%
  summarize(
    n_cells = n(),
    mean_entropy = mean(Shannon_Entropy),
    mean_gini = mean(Gini_Index),
    mean_diversity = mean(Diversity_Score),
    mean_stemness = mean(Stemness_Score),
    sd_stemness = sd(Stemness_Score)
  ) %>%
  arrange(desc(mean_stemness))

cat("\n=== STEMNESS/POTENCY SCORES BY CLUSTER ===\n")
print(cluster_potency)

# ============================================
# VISUALIZATION 1: Stemness Score Barplot
# ============================================
p1 <- ggplot(cluster_potency, 
             aes(x = reorder(seurat_clusters, -mean_stemness), 
                 y = mean_stemness,
                 fill = mean_stemness)) +
  geom_bar(stat = "identity") +
  geom_errorbar(aes(ymin = mean_stemness - sd_stemness,
                    ymax = mean_stemness + sd_stemness),
                width = 0.3, alpha = 0.5) +
  scale_fill_gradient(low = "blue", high = "red") +
  labs(title = "Entropy-Based Stemness Score by Cluster",
       subtitle = "Combined metric: Shannon entropy + Gini + Diversity",
       x = "Cluster", y = "Mean Stemness Score",
       fill = "Stemness") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 11),
        plot.title = element_text(face = "bold", size = 14))

print(p1)

# ============================================
# VISUALIZATION 2: Stemness on UMAP
# ============================================
p2 <- FeaturePlot(seurat_obj, 
                  features = "Stemness_Score",
                  reduction = "umap",
                  pt.size = 1.5) +
  scale_color_gradient(low = "blue", high = "red") +
  labs(title = "Stemness Score in UMAP Space",
       subtitle = "Red = High stemness/potency")

print(p2)

# ============================================
# VISUALIZATION 3: Component Scores Heatmap
# ============================================
library(reshape2)
heatmap_data <- cluster_potency %>%
  select(seurat_clusters, mean_entropy, mean_gini, mean_diversity) %>%
  mutate(mean_gini = 1 - mean_gini) %>%  # Invert for consistency
  mutate(across(c(mean_entropy, mean_gini, mean_diversity), 
                ~scales::rescale(.x, to = c(0, 1)))) %>%
  melt(id.vars = "seurat_clusters")

p3 <- ggplot(heatmap_data, aes(x = variable, y = seurat_clusters, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(value, 2)), color = "black", size = 3.5) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", 
                       midpoint = 0.5) +
  labs(title = "Component Metrics of Stemness",
       x = "Metric", y = "Cluster", fill = "Normalized\nScore") +
  scale_x_discrete(labels = c("Entropy", "Specialization", "Diversity")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p3)

# ============================================
# VISUALIZATION 4: Violin Plot
# ============================================
p4 <- VlnPlot(seurat_obj, 
              features = "Stemness_Score",
              pt.size = 0) +
  labs(title = "Stemness Score Distribution",
       x = "Cluster", y = "Stemness Score") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p4)

# ============================================
# INTEGRATE WITH SÉZARY MARKERS
# ============================================
sezary_markers <- list(
  early_memory = c("CCR7", "SELL", "TCF7", "LEF1"),
  exhausted = c("PDCD1", "CTLA4", "TIGIT", "LAG3")
)

seurat_obj <- AddModuleScore(seurat_obj, 
                             features = sezary_markers,
                             name = c("EarlyMemory", "Exhausted"))

cor_memory <- cor(seurat_obj$Stemness_Score, seurat_obj$EarlyMemory1, use = "complete.obs")
cor_exhausted <- cor(seurat_obj$Stemness_Score, seurat_obj$Exhausted2, use = "complete.obs")

cat("\n=== VALIDATION AGAINST MARKER GENES ===\n")
cat("Correlation Stemness vs Early Memory:", round(cor_memory, 3), "\n")
cat("Correlation Stemness vs Exhaustion:", round(cor_exhausted, 3), "\n")

p5 <- ggplot(seurat_obj@meta.data, 
             aes(x = Stemness_Score, y = EarlyMemory1, color = seurat_clusters)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", color = "black") +
  labs(title = "Stemness Score vs Early Memory Markers",
       subtitle = paste0("r = ", round(cor_memory, 3)),
       x = "Stemness Score", y = "Early Memory Score") +
  theme_minimal()

print(p5)

# ============================================
# ROOT RECOMMENDATION
# ============================================
cat("\n=== TRAJECTORY ROOT RECOMMENDATION ===\n\n")
cat("✓✓✓ HIGHEST STEMNESS/POTENCY CLUSTER: ", cluster_potency$seurat_clusters, "\n")[1]
cat("     Stemness Score: ", round(cluster_potency$mean_stemness, 3), "\n\n")[1]

cat("✓ Top 3 stem-like clusters:\n")
print(cluster_potency %>% head(3) %>% select(seurat_clusters, mean_stemness))

cat("\n✓ Top 3 differentiated clusters:\n")
print(cluster_potency %>% tail(3) %>% select(seurat_clusters, mean_stemness))

# ============================================
# SAVE RESULTS
# ============================================
write.csv(cluster_potency, 
          "Output_Figures/Stemness_Potency_ByCluster.csv", 
          row.names = FALSE)

# Combined figure
p_combined <- (p1 | p2) / (p3 | p4) + 
  plot_annotation(title = "Entropy-Based Stemness Analysis",
                  subtitle = "Root-free differentiation state assessment",
                  theme = theme(plot.title = element_text(size = 16, face = "bold")))

print(p_combined)
ggsave("Output_Figures/Stemness_Analysis_Complete.pdf", p_combined, 
       width = 16, height = 12)

cat("\n✓ Analysis complete! All results saved to Output_Figures/\n")
```




```{r , fig.height=16, fig.width=16}


```


