1 load libraries

# Data Processing
library(dplyr)
library(Seurat)
library(tibble)
library(tidyr)
library(stringr)

# Visualization
library(ggplot2)
library(ComplexHeatmap)
library(patchwork)
library(SCpubr)

# Regulatory Network Inference
library(decoupleR)
library(dorothea)
data(dorothea_hs, package = "dorothea")
library(tictoc)

2 Load Seurat Object

library(Seurat)
library(reticulate)
library(ggplot2)

# 1. Load Object
seurat_obj <- readRDS("temp_seurat_obj.rds")

3 SUPPLEMENTARY FIGURE: PAGA Trajectory Analysis (via Reticulate/Scanpy)

# Display in notebook
if(file.exists(output_file)){
  knitr::include_graphics(output_file)
}

4 SUPPLEMENTARY FIGURE: PAGA on Gene Expression (SCT Assay)

5 SUPPLEMENTARY FIGURE: Quick Code to Generate Malignant-Only PAGA

6 SUPPLEMENTARY FIGURE: UMAP Biplot Code (Correlation-Based Approach)

library(Seurat)
library(ggplot2)
library(dplyr)
library(ggrepel)

# 1. Extract UMAP Embeddings
DefaultAssay(seurat_obj) <- "dorothea"
umap_data <- Embeddings(seurat_obj, reduction = "umap")[, 1:2] %>% 
  as.data.frame() %>% 
  mutate(Cluster = as.factor(seurat_obj$seurat_clusters))

# 2. Get TF Activity Matrix
tf_matrix <- GetAssayData(seurat_obj, assay = "dorothea", layer = "data")

# 3. Compute TF Correlations with UMAP Axes
tf_cors <- data.frame(
  TF = rownames(tf_matrix),
  UMAP_1_cor = apply(tf_matrix, 1, function(x) cor(x, umap_data$umap_1)),
  UMAP_2_cor = apply(tf_matrix, 1, function(x) cor(x, umap_data$umap_2))
)

# 4. Filter Top Drivers (Top 3 per quadrant)
umap1_top <- tf_cors %>% arrange(desc(UMAP_1_cor)) %>% head(3) %>% pull(TF)
umap1_bottom <- tf_cors %>% arrange(UMAP_1_cor) %>% head(3) %>% pull(TF)
umap2_top <- tf_cors %>% arrange(desc(UMAP_2_cor)) %>% head(3) %>% pull(TF)
umap2_bottom <- tf_cors %>% arrange(UMAP_2_cor) %>% head(3) %>% pull(TF)

top_tf_drivers <- unique(c(umap1_top, umap1_bottom, umap2_top, umap2_bottom))

# 5. Prepare Arrow Data
arrow_data <- tf_cors %>% filter(TF %in% top_tf_drivers)

# Scale arrows to fit nicely on the plot
scale_factor <- max(abs(umap_data$umap_1)) / max(abs(arrow_data$UMAP_1_cor)) * 0.7
arrow_data$UMAP_1 <- arrow_data$UMAP_1_cor * scale_factor
arrow_data$UMAP_2 <- arrow_data$UMAP_2_cor * scale_factor

# 6. Calculate Cluster Centroids for Labels
cluster_centroids <- umap_data %>%
  group_by(Cluster) %>%
  summarise(
    umap_1_center = median(umap_1),
    umap_2_center = median(umap_2)
  )

# 7. Plot UMAP Biplot with Cluster Labels
p_umap_biplot <- ggplot(umap_data, aes(x = umap_1, y = umap_2)) +
  # Points (Cells) - larger and more visible
  geom_point(aes(color = Cluster), alpha = 0.7, size = 2) +
  
  # Cluster Labels (at centroids)
  geom_text(data = cluster_centroids, 
            aes(x = umap_1_center, y = umap_2_center, label = Cluster),
            fontface = "bold", size = 6, color = "black") +
  
  # Arrows (TFs) - from center of UMAP
  geom_segment(data = arrow_data, 
               aes(x = mean(umap_data$umap_1), y = mean(umap_data$umap_2), 
                   xend = mean(umap_data$umap_1) + UMAP_1, 
                   yend = mean(umap_data$umap_2) + UMAP_2),
               arrow = arrow(length = unit(0.25, "cm")), 
               color = "darkred", linewidth = 1) +
  
  # TF Labels (using ggrepel to avoid overlap with cluster labels)
  geom_text_repel(data = arrow_data, 
                  aes(x = mean(umap_data$umap_1) + UMAP_1, 
                      y = mean(umap_data$umap_2) + UMAP_2, 
                      label = TF),
                  fontface = "bold.italic", color = "darkred", size = 4.5, 
                  box.padding = 0.6, point.padding = 0.4,
                  min.segment.length = 0.2) +
  
  # Styling
  scale_color_manual(values = Seurat::DiscretePalette(14)) +
  labs(title = "Regulatory Drivers in UMAP Space",
       subtitle = "Cluster numbers shown at centroids; TF arrows indicate regulatory associations",
       x = "UMAP_1", 
       y = "UMAP_2") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "right",
    panel.grid.minor = element_blank()
  )

# Save
ggsave("Output_Figures/Supplementary_TF_UMAP_Biplot.pdf", p_umap_biplot, width = 12, height = 9)
print(p_umap_biplot)

7 Check marker expression to validate root choice

library(Seurat)
library(ggplot2)
library(dplyr)
library(pheatmap)

# Make sure SCT is default
DefaultAssay(seurat_obj) <- "SCT"

# SƩzary-specific markers
sezary_markers <- list(
  early_memory = c("CCR7", "SELL", "TCF7", "LEF1"),
  malignant = c("TOX", "TWIST1", "DNM3", "STAT5A"),
  exhausted = c("PDCD1", "CTLA4", "TIGIT", "LAG3"),
  proliferative = c("MKI67", "TOP2A", "PCNA")
)

all_markers <- unlist(sezary_markers)

# ============================================
# 1. DOT PLOT - ALL CLUSTERS
# ============================================
p_dot <- DotPlot(seurat_obj, 
                 features = sezary_markers,
                 assay = "SCT",  # Explicitly specify assay
                 cols = c("lightgrey", "red"),
                 dot.scale = 6) +
  coord_flip() +
  ggtitle("SƩzary Markers Across All Clusters") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
        axis.text.y = element_text(size = 10)) +
  labs(x = "Markers", y = "Clusters")

print(p_dot)
ggsave("Output_Figures/Sezary_Markers_DotPlot_AllClusters.pdf", p_dot, width = 14, height = 8)

# ============================================
# 2. AVERAGE EXPRESSION (FIXED)
# ============================================
# Use AggregateExpression (recommended in Seurat v5) with correct assay
avg_exp <- AggregateExpression(seurat_obj, 
                                features = all_markers,
                                assays = "SCT",  # Specify SCT only
                                group.by = "seurat_clusters",
                                slot = "data",
                                return.seurat = FALSE)

# Extract the SCT matrix
expr_matrix <- as.matrix(avg_exp$SCT)

# Check dimensions
cat("Expression matrix dimensions:", nrow(expr_matrix), "genes x", ncol(expr_matrix), "clusters\n")
Expression matrix dimensions: 15 genes x 14 clusters
print(head(expr_matrix))
          g0    g1    g2    g3    g4   g5   g6   g7   g8   g9  g10  g11  g12  g13
CCR7   14443  1144 11455 17619 12865 5533 9463 4391 7776 5016 8035 3217 4113 8053
SELL    1558   145   657 17789   753 3982  277 2210 1287 2874 9643 1256  409  115
TCF7     125  3808   341 19510    52 3758  402 1011   50 2130 8873  368  207  114
LEF1     687  9412   761 14007   169 9625  771 1806  206 4801 5217  962  326   68
TOX     2074 14642   984   132   784 2792  373 1300 2539 1715  491  675  411  157
TWIST1  3668  3259  2324     0  3893  212 1720 1276 2535  559   28  674  466  496
# ============================================
# 3. HEATMAP - Scaled Expression
# ============================================
# Scale by row (gene) for better visualization
expr_matrix_scaled <- t(scale(t(expr_matrix)))

# Annotation for marker categories
annotation_row <- data.frame(
  Category = rep(names(sezary_markers), times = sapply(sezary_markers, length)),
  row.names = all_markers
)

annotation_colors <- list(
  Category = c(early_memory = "#4DAF4A", 
               malignant = "#E41A1C",
               exhausted = "#377EB8",
               proliferative = "#FF7F00")
)

pdf("Output_Figures/Sezary_Markers_Heatmap_AllClusters.pdf", width = 12, height = 8)

pheatmap(expr_matrix_scaled,
         cluster_rows = FALSE,
         cluster_cols = TRUE,
         annotation_row = annotation_row,
         annotation_colors = annotation_colors,
         color = colorRampPalette(c("navy", "white", "firebrick"))(100),
         main = "SƩzary Markers - All Clusters (Scaled)",
         fontsize = 10,
         angle_col = 0)
dev.off()
pdf 
  3 

# ============================================
# 4. CALCULATE CATEGORY SCORES PER CLUSTER
# ============================================
category_scores <- data.frame()

for(category in names(sezary_markers)) {
  markers_in_category <- sezary_markers[[category]]
  
  # Calculate mean expression per cluster for this category
  category_mean <- colMeans(expr_matrix[markers_in_category, , drop = FALSE])
  
  temp_df <- data.frame(
    Cluster = names(category_mean),
    Category = category,
    Score = as.numeric(category_mean)
  )
  
  category_scores <- rbind(category_scores, temp_df)
}

# Reshape for easier analysis
library(tidyr)
category_scores_wide <- category_scores %>%
  pivot_wider(names_from = Category, values_from = Score)

# Remove 'g' prefix if present (Seurat adds this)
category_scores_wide$Cluster <- gsub("^g", "", category_scores_wide$Cluster)

cat("\n=== CATEGORY SCORES BY CLUSTER ===\n")

=== CATEGORY SCORES BY CLUSTER ===
print(category_scores_wide)
# A tibble: 14 Ɨ 5
   Cluster early_memory malignant exhausted proliferative
   <chr>          <dbl>     <dbl>     <dbl>         <dbl>
 1 0              4203.     2576.      382.        16194.
 2 1              3627.     4727.      101.        21402.
 3 2              3304.     1559      3609.         5791.
 4 3             17231.      397.      146.          215.
 5 4              3460.     1658.      133         10292 
 6 5              5724.     1007.      173          6241 
 7 6              2728.     1211.     2705.         8700.
 8 7              2354.     1136.      306.        15908.
 9 8              2330.     1832.     4309         12281.
10 9              3705.      720.      220          2966 
11 10             7942       390.      442.          272.
12 11             1451.      510.      575          1660.
13 12             1264.      608.      388.         2580.
14 13             2088.      354.      265.         1797.
# ============================================
# 5. IDENTIFY CLUSTER PHENOTYPES
# ============================================
cat("\n=== CLUSTER PHENOTYPE CLASSIFICATION ===\n")

=== CLUSTER PHENOTYPE CLASSIFICATION ===
# Rank clusters by each category
rankings <- category_scores_wide %>%
  mutate(
    early_memory_rank = rank(-early_memory),
    malignant_rank = rank(-malignant),
    exhausted_rank = rank(-exhausted),
    proliferative_rank = rank(-proliferative)
  )

# Identify top 3 clusters per category
cat("\nāœ“ Top 3 STEM-LIKE clusters (Early Memory):\n")

āœ“ Top 3 STEM-LIKE clusters (Early Memory):
top_memory <- rankings %>% arrange(early_memory_rank) %>% head(3) %>% 
  select(Cluster, early_memory, early_memory_rank)
print(top_memory)
# A tibble: 3 Ɨ 3
  Cluster early_memory early_memory_rank
  <chr>          <dbl>             <dbl>
1 3             17231.                 1
2 10             7942                  2
3 5              5724.                 3
cat("\nāœ“ Top 3 MALIGNANT clusters:\n")

āœ“ Top 3 MALIGNANT clusters:
top_malignant <- rankings %>% arrange(malignant_rank) %>% head(3) %>% 
  select(Cluster, malignant, malignant_rank)
print(top_malignant)
# A tibble: 3 Ɨ 3
  Cluster malignant malignant_rank
  <chr>       <dbl>          <dbl>
1 1           4727.              1
2 0           2576.              2
3 8           1832.              3
cat("\nāœ“ Top 3 EXHAUSTED/TERMINAL clusters:\n")

āœ“ Top 3 EXHAUSTED/TERMINAL clusters:
top_exhausted <- rankings %>% arrange(exhausted_rank) %>% head(3) %>% 
  select(Cluster, exhausted, exhausted_rank)
print(top_exhausted)
# A tibble: 3 Ɨ 3
  Cluster exhausted exhausted_rank
  <chr>       <dbl>          <dbl>
1 8           4309               1
2 2           3609.              2
3 6           2705.              3
cat("\nāœ“ Top 3 PROLIFERATIVE clusters:\n")

āœ“ Top 3 PROLIFERATIVE clusters:
top_prolif <- rankings %>% arrange(proliferative_rank) %>% head(3) %>% 
  select(Cluster, proliferative, proliferative_rank)
print(top_prolif)
# A tibble: 3 Ɨ 3
  Cluster proliferative proliferative_rank
  <chr>           <dbl>              <dbl>
1 1              21402.                  1
2 0              16194.                  2
3 7              15908.                  3
# ============================================
# 6. BARPLOT - Category Scores
# ============================================
p_bar <- ggplot(category_scores, aes(x = gsub("^g", "", Cluster), y = Score, fill = Category)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = c(early_memory = "#4DAF4A", 
                                malignant = "#E41A1C",
                                exhausted = "#377EB8",
                                proliferative = "#FF7F00")) +
  labs(title = "Functional Signatures Across All Clusters",
       x = "Cluster", y = "Mean Expression Score") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(face = "bold", size = 14),
        legend.position = "right")

print(p_bar)
ggsave("Output_Figures/Sezary_CategoryScores_Barplot.pdf", p_bar, width = 14, height = 6)

# ============================================
# 7. TRAJECTORY ROOT RECOMMENDATION
# ============================================
cat("\n=== TRAJECTORY INFERENCE RECOMMENDATION ===\n")

=== TRAJECTORY INFERENCE RECOMMENDATION ===
trajectory_guide <- category_scores_wide %>%
  mutate(
    Stemness_Score = early_memory,
    Differentiation_Score = (exhausted + malignant) / 2,
    Balance = Stemness_Score - Differentiation_Score
  ) %>%
  arrange(desc(Balance))

cat("\nClusters ranked by Stemness Score:\n")

Clusters ranked by Stemness Score:
print(trajectory_guide %>% select(Cluster, Stemness_Score, Differentiation_Score, Balance))
# A tibble: 14 Ɨ 4
   Cluster Stemness_Score Differentiation_Score Balance
   <chr>            <dbl>                 <dbl>   <dbl>
 1 3               17231.                  271.  16960 
 2 10               7942                   416    7526 
 3 5                5724.                  590.   5135.
 4 9                3705.                  470.   3236.
 5 0                4203.                 1479.   2724.
 6 4                3460.                  895.   2564.
 7 13               2088.                  310.   1778.
 8 7                2354.                  721.   1634.
 9 1                3627.                 2414    1213.
10 11               1451.                  543.    908.
11 6                2728.                 1958.    770 
12 12               1264.                  498.    766.
13 2                3304.                 2584.    720.
14 8                2330.                 3070.   -740.
cat("\nāœ“āœ“āœ“ RECOMMENDED ROOT CLUSTER: ", 
    trajectory_guide$Cluster[1], 
    " (highest stemness, lowest differentiation)\n\n", sep = "")

āœ“āœ“āœ“ RECOMMENDED ROOT CLUSTER: 3 (highest stemness, lowest differentiation)
cat("āœ“ POTENTIAL TERMINAL CLUSTERS: ", 
    paste(tail(trajectory_guide$Cluster, 3), collapse = ", "),
    " (high exhaustion/malignancy)\n\n", sep = "")
āœ“ POTENTIAL TERMINAL CLUSTERS: 12, 2, 8 (high exhaustion/malignancy)
# Save summary tables
write.csv(category_scores_wide, "Output_Figures/Cluster_Phenotype_Scores.csv", row.names = FALSE)
write.csv(trajectory_guide, "Output_Figures/Trajectory_Root_Recommendations.csv", row.names = FALSE)

cat("āœ“ Analysis complete! Tables saved to Output_Figures/\n")
āœ“ Analysis complete! Tables saved to Output_Figures/

8 ROBUST Entropy Analysis (No External Dependencies)

library(Seurat)
library(ggplot2)
library(dplyr)
library(patchwork)

DefaultAssay(seurat_obj) <- "SCT"

cat("Computing entropy-based stemness metrics...\n")
Computing entropy-based stemness metrics...
# ============================================
# Function: Shannon Entropy Calculation
# ============================================
calculate_shannon_entropy <- function(expression_vector) {
  expr_nonzero <- expression_vector[expression_vector > 0]
  if(length(expr_nonzero) == 0) return(0)
  
  probs <- expr_nonzero / sum(expr_nonzero)
  entropy <- -sum(probs * log2(probs))
  return(entropy)
}

# ============================================
# Function: Gini Coefficient (Specialization)
# ============================================
calculate_gini <- function(x) {
  x <- x[x > 0]
  if(length(x) == 0) return(0)
  
  n <- length(x)
  x_sorted <- sort(x)
  gini <- (2 * sum((1:n) * x_sorted)) / (n * sum(x_sorted)) - (n + 1) / n
  return(gini)
}

# ============================================
# STEP 1: Calculate Entropy Per Cell
# ============================================
expr_matrix <- GetAssayData(seurat_obj, assay = "SCT", layer = "data")
hvgs <- VariableFeatures(seurat_obj)
expr_hvg <- expr_matrix[hvgs, ]

cat("Calculating entropy for", ncol(expr_hvg), "cells using", length(hvgs), "HVGs...\n")
Calculating entropy for 49305 cells using 3000 HVGs...
# Shannon Entropy
entropy_scores <- apply(expr_hvg, 2, calculate_shannon_entropy)
seurat_obj$Shannon_Entropy <- entropy_scores

# Gini Index
gini_scores <- apply(expr_hvg, 2, calculate_gini)
seurat_obj$Gini_Index <- gini_scores

# Transcriptional Diversity Score
seurat_obj$Diversity_Score <- seurat_obj$nFeature_RNA / max(seurat_obj$nFeature_RNA)

# ============================================
# STEP 2: Combined Stemness Score
# ============================================
# Normalize and combine metrics
seurat_obj$Stemness_Score <- 
  scales::rescale(seurat_obj$Shannon_Entropy, to = c(0, 1)) * 0.4 +
  (1 - scales::rescale(seurat_obj$Gini_Index, to = c(0, 1))) * 0.3 +  # Invert Gini
  seurat_obj$Diversity_Score * 0.3

# ============================================
# STEP 3: Per-Cluster Analysis
# ============================================
cluster_potency <- seurat_obj@meta.data %>%
  group_by(seurat_clusters) %>%
  summarize(
    n_cells = n(),
    mean_entropy = mean(Shannon_Entropy),
    mean_gini = mean(Gini_Index),
    mean_diversity = mean(Diversity_Score),
    mean_stemness = mean(Stemness_Score),
    sd_stemness = sd(Stemness_Score)
  ) %>%
  arrange(desc(mean_stemness))

cat("\n=== STEMNESS/POTENCY SCORES BY CLUSTER ===\n")

=== STEMNESS/POTENCY SCORES BY CLUSTER ===
print(cluster_potency)
# A tibble: 14 Ɨ 7
   seurat_clusters n_cells mean_entropy mean_gini mean_diversity mean_stemness sd_stemness
   <fct>             <int>        <dbl>     <dbl>          <dbl>         <dbl>       <dbl>
 1 8                  3338         9.92     0.318          0.654         0.587      0.0610
 2 12                 1063         9.74     0.317          0.586         0.536      0.0958
 3 1                  5275         9.75     0.324          0.609         0.535      0.0642
 4 13                  691         9.70     0.319          0.524         0.508      0.0750
 5 6                  3536         9.67     0.316          0.498         0.498      0.0637
 6 0                  6789         9.66     0.323          0.529         0.496      0.0651
 7 7                  3409         9.67     0.312          0.472         0.495      0.0693
 8 4                  4086         9.66     0.328          0.540         0.494      0.0668
 9 5                  3634         9.59     0.311          0.480         0.485      0.0824
10 2                  4663         9.56     0.323          0.467         0.459      0.0709
11 11                 1675         9.55     0.306          0.352         0.446      0.0692
12 9                  3273         9.33     0.332          0.356         0.375      0.0544
13 10                 3212         9.15     0.291          0.281         0.374      0.0785
14 3                  4661         8.98     0.304          0.247         0.319      0.0669
# ============================================
# VISUALIZATION 1: Stemness Score Barplot
# ============================================
p1 <- ggplot(cluster_potency, 
             aes(x = reorder(seurat_clusters, -mean_stemness), 
                 y = mean_stemness,
                 fill = mean_stemness)) +
  geom_bar(stat = "identity") +
  geom_errorbar(aes(ymin = mean_stemness - sd_stemness,
                    ymax = mean_stemness + sd_stemness),
                width = 0.3, alpha = 0.5) +
  scale_fill_gradient(low = "blue", high = "red") +
  labs(title = "Entropy-Based Stemness Score by Cluster",
       subtitle = "Combined metric: Shannon entropy + Gini + Diversity",
       x = "Cluster", y = "Mean Stemness Score",
       fill = "Stemness") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 11),
        plot.title = element_text(face = "bold", size = 14))

print(p1)


# ============================================
# VISUALIZATION 2: Stemness on UMAP
# ============================================
p2 <- FeaturePlot(seurat_obj, 
                  features = "Stemness_Score",
                  reduction = "umap",
                  pt.size = 1.5) +
  scale_color_gradient(low = "blue", high = "red") +
  labs(title = "Stemness Score in UMAP Space",
       subtitle = "Red = High stemness/potency")

print(p2)


# ============================================
# VISUALIZATION 3: Component Scores Heatmap
# ============================================
library(reshape2)
heatmap_data <- cluster_potency %>%
  select(seurat_clusters, mean_entropy, mean_gini, mean_diversity) %>%
  mutate(mean_gini = 1 - mean_gini) %>%  # Invert for consistency
  mutate(across(c(mean_entropy, mean_gini, mean_diversity), 
                ~scales::rescale(.x, to = c(0, 1)))) %>%
  melt(id.vars = "seurat_clusters")

p3 <- ggplot(heatmap_data, aes(x = variable, y = seurat_clusters, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(value, 2)), color = "black", size = 3.5) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", 
                       midpoint = 0.5) +
  labs(title = "Component Metrics of Stemness",
       x = "Metric", y = "Cluster", fill = "Normalized\nScore") +
  scale_x_discrete(labels = c("Entropy", "Specialization", "Diversity")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p3)


# ============================================
# VISUALIZATION 4: Violin Plot
# ============================================
p4 <- VlnPlot(seurat_obj, 
              features = "Stemness_Score",
              pt.size = 0) +
  labs(title = "Stemness Score Distribution",
       x = "Cluster", y = "Stemness Score") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p4)


# ============================================
# INTEGRATE WITH SƉZARY MARKERS
# ============================================
sezary_markers <- list(
  early_memory = c("CCR7", "SELL", "TCF7", "LEF1"),
  exhausted = c("PDCD1", "CTLA4", "TIGIT", "LAG3")
)

seurat_obj <- AddModuleScore(seurat_obj, 
                             features = sezary_markers,
                             name = c("EarlyMemory", "Exhausted"))

cor_memory <- cor(seurat_obj$Stemness_Score, seurat_obj$EarlyMemory1, use = "complete.obs")
cor_exhausted <- cor(seurat_obj$Stemness_Score, seurat_obj$Exhausted2, use = "complete.obs")

cat("\n=== VALIDATION AGAINST MARKER GENES ===\n")

=== VALIDATION AGAINST MARKER GENES ===
cat("Correlation Stemness vs Early Memory:", round(cor_memory, 3), "\n")
Correlation Stemness vs Early Memory: -0.583 
cat("Correlation Stemness vs Exhaustion:", round(cor_exhausted, 3), "\n")
Correlation Stemness vs Exhaustion: 0.063 
p5 <- ggplot(seurat_obj@meta.data, 
             aes(x = Stemness_Score, y = EarlyMemory1, color = seurat_clusters)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", color = "black") +
  labs(title = "Stemness Score vs Early Memory Markers",
       subtitle = paste0("r = ", round(cor_memory, 3)),
       x = "Stemness Score", y = "Early Memory Score") +
  theme_minimal()

print(p5)


# ============================================
# ROOT RECOMMENDATION
# ============================================
cat("\n=== TRAJECTORY ROOT RECOMMENDATION ===\n\n")

=== TRAJECTORY ROOT RECOMMENDATION ===
cat("āœ“āœ“āœ“ HIGHEST STEMNESS/POTENCY CLUSTER: ", cluster_potency$seurat_clusters, "\n")[1]
āœ“āœ“āœ“ HIGHEST STEMNESS/POTENCY CLUSTER:  9 13 2 14 7 1 8 5 6 3 12 10 11 4 
NULL
cat("     Stemness Score: ", round(cluster_potency$mean_stemness, 3), "\n\n")[1]
     Stemness Score:  0.587 0.536 0.535 0.508 0.498 0.496 0.495 0.494 0.485 0.459 0.446 0.375 0.374 0.319 

NULL
cat("āœ“ Top 3 stem-like clusters:\n")
āœ“ Top 3 stem-like clusters:
print(cluster_potency %>% head(3) %>% select(seurat_clusters, mean_stemness))
# A tibble: 3 Ɨ 2
  seurat_clusters mean_stemness
  <fct>                   <dbl>
1 8                       0.587
2 12                      0.536
3 1                       0.535
cat("\nāœ“ Top 3 differentiated clusters:\n")

āœ“ Top 3 differentiated clusters:
print(cluster_potency %>% tail(3) %>% select(seurat_clusters, mean_stemness))
# A tibble: 3 Ɨ 2
  seurat_clusters mean_stemness
  <fct>                   <dbl>
1 9                       0.375
2 10                      0.374
3 3                       0.319
# ============================================
# SAVE RESULTS
# ============================================
write.csv(cluster_potency, 
          "Output_Figures/Stemness_Potency_ByCluster.csv", 
          row.names = FALSE)

# Combined figure
p_combined <- (p1 | p2) / (p3 | p4) + 
  plot_annotation(title = "Entropy-Based Stemness Analysis",
                  subtitle = "Root-free differentiation state assessment",
                  theme = theme(plot.title = element_text(size = 16, face = "bold")))

print(p_combined)

ggsave("Output_Figures/Stemness_Analysis_Complete.pdf", p_combined, 
       width = 16, height = 12)

cat("\nāœ“ Analysis complete! All results saved to Output_Figures/\n")

āœ“ Analysis complete! All results saved to Output_Figures/
---
title: "PAGA using TF Activity"
author: "Nasir Mahmood Abbasi"
date: "`r Sys.Date()`"
output:
  html_notebook:
    number_sections: true
    toc: true
    toc_float:
      collapsed: true
    theme: journal
---


# load libraries
```{r setup, include=TRUE}
# Data Processing
library(dplyr)
library(Seurat)
library(tibble)
library(tidyr)
library(stringr)

# Visualization
library(ggplot2)
library(ComplexHeatmap)
library(patchwork)
library(SCpubr)

# Regulatory Network Inference
library(decoupleR)
library(dorothea)
data(dorothea_hs, package = "dorothea")
library(tictoc)


```

# Load Seurat Object 
```{r}
library(Seurat)
library(reticulate)
library(ggplot2)

# 1. Load Object
seurat_obj <- readRDS("temp_seurat_obj.rds")



```

# SUPPLEMENTARY FIGURE: PAGA Trajectory Analysis (via Reticulate/Scanpy)
```{r, fig.height=16, fig.width=16}
library(Seurat)
library(reticulate)
library(ggplot2)

# 1. FORCE USE of the specific python binary where scanpy is installed
# This overrides the default "auto" discovery which is failing you
Sys.setenv(RETICULATE_PYTHON = "/home/bioinfo/.virtualenvs/r-reticulate/bin/python")
use_python("/home/bioinfo/.virtualenvs/r-reticulate/bin/python", required = TRUE)

# 2. Verify it's working (Optional but good for debugging)
cat("Using Python:", py_config()$python, "\n")

# 3. Import Libraries
sc <- import("scanpy")
ad <- import("anndata")

cat("✓ Python libraries imported successfully!\n")

# 4. Prepare Data for PAGA
cat("Converting Seurat object to AnnData...\n")
tf_matrix <- t(GetAssayData(seurat_obj, assay = "dorothea", layer = "data"))
obs <- data.frame(row.names = colnames(seurat_obj), 
                  louvain = as.character(seurat_obj$seurat_clusters))

adata <- ad$AnnData(X = tf_matrix, obs = obs)

# 5. Run Scanpy Workflow
cat("Running PAGA...\n")
sc$pp$neighbors(adata, use_rep = "X", n_neighbors = 15L)
sc$tl$paga(adata, groups = "louvain")

# 6. Plot and Save
output_file <- "Output_Figures/Supplementary_PAGA_TF_Activity.png"
if(!dir.exists("Output_Figures")) dir.create("Output_Figures")

# CORRECT ORDER: Draw PAGA first to compute positions
sc$pl$paga(adata, threshold = 0.15, show = FALSE)

# Now UMAP can use those positions
sc$tl$umap(adata, init_pos = "paga")

# Plot PAGA with connectivity threshold
sc$pl$paga(adata, threshold = 0.15, show = FALSE, save = "_TF_Activity.png")

# Scanpy saves to "figures/" by default
if(file.exists("figures/paga_TF_Activity.png")){
  file.rename("figures/paga_TF_Activity.png", output_file)
  cat(sprintf("✓ PAGA plot saved to: %s\n", output_file))
} else {
  cat("Note: Check './figures/' directory for output\n")
}

# Display in notebook
if(file.exists(output_file)){
  knitr::include_graphics(output_file)
}

```

# SUPPLEMENTARY FIGURE: PAGA on Gene Expression (SCT Assay)
```{r, fig.height=16, fig.width=16}
library(Seurat)
library(reticulate)

# Use same python environment
Sys.setenv(RETICULATE_PYTHON = "/home/bioinfo/.virtualenvs/r-reticulate/bin/python")
use_python("/home/bioinfo/.virtualenvs/r-reticulate/bin/python", required = TRUE)

sc <- import("scanpy")
ad <- import("anndata")

cat("✓ Python libraries imported\n")

# 1. Extract SCT-normalized gene expression
DefaultAssay(seurat_obj) <- "SCT"

# Use highly variable features from SCT
hvgs <- VariableFeatures(seurat_obj)

# Get normalized data (log1p of corrected counts)
gene_matrix <- t(GetAssayData(seurat_obj, assay = "SCT", layer = "data")[hvgs, ])

cat(sprintf("Using %d highly variable genes from SCT assay\n", length(hvgs)))

# 2. Prepare AnnData object
obs <- data.frame(
  row.names = colnames(seurat_obj),
  louvain = as.character(seurat_obj$seurat_clusters)
)

adata_sct <- ad$AnnData(X = gene_matrix, obs = obs)

# 3. Run PAGA workflow
cat("Computing neighbors and PAGA on SCT-normalized expression...\n")
sc$pp$neighbors(adata_sct, use_rep = "X", n_neighbors = 15L)
sc$tl$paga(adata_sct, groups = "louvain")

# 4. Plot and save
output_file_sct <- "Output_Figures/Supplementary_PAGA_GeneExpression_SCT.png"

# Draw PAGA first to compute positions
sc$pl$paga(adata_sct, threshold = 0.15, show = FALSE)

# Compute UMAP initialized by PAGA
sc$tl$umap(adata_sct, init_pos = "paga")

# Final PAGA plot
sc$pl$paga(adata_sct, threshold = 0.15, show = FALSE, save = "_GeneExpression_SCT.png")

# Move from figures/ to Output_Figures/
if(file.exists("figures/paga_GeneExpression_SCT.png")){
  file.rename("figures/paga_GeneExpression_SCT.png", output_file_sct)
  cat(sprintf("✓ SCT Gene Expression PAGA saved to: %s\n", output_file_sct))
}

# Display
if(file.exists(output_file_sct)){
  knitr::include_graphics(output_file_sct)
}
```
# SUPPLEMENTARY FIGURE: Quick Code to Generate Malignant-Only PAGA
```{r paga_analysis_SCT, fig.height=16, fig.width=16}
# Run PAGA on malignant cells only
library(reticulate)
library(Seurat)

# 1. Subset to malignant cells
malignant_clusters <- setdiff(unique(seurat_obj$seurat_clusters), c(3, 10))
seurat_malignant <- subset(seurat_obj, subset = seurat_clusters %in% malignant_clusters)

# 2. Prepare for PAGA
sc <- import("scanpy")
ad <- import("anndata")

# Get expression matrix and metadata (malignant only)
expr_matrix <- t(GetAssayData(seurat_malignant, assay = "SCT", layer = "data"))
obs <- data.frame(
  row.names = colnames(seurat_malignant),
  louvain = as.character(seurat_malignant$seurat_clusters)
)

adata_malignant <- ad$AnnData(X = expr_matrix, obs = obs)

# 3. Run PAGA
sc$pp$neighbors(adata_malignant, use_rep = "X", n_neighbors = 15L)
sc$tl$paga(adata_malignant, groups = "louvain")

# 4. Plot
sc$pl$paga(adata_malignant, threshold = 0.15, show = FALSE)

# Compute UMAP initialized by PAGA
sc$tl$umap(adata_malignant, init_pos = "paga")

# Save figure
plt <- import("matplotlib.pyplot")
sc$pl$paga(adata_malignant, threshold = 0.15, show = FALSE)  # Draw the plot
plt$savefig("Output_Figures/PAGA_Malignant_Only.png", dpi = 300L, bbox_inches = "tight")
plt$close()

cat("✓ Malignant-only PAGA saved to Output_Figures/PAGA_Malignant_Only.png\n")


output_file <- "Output_Figures/PAGA_Malignant_Only.png"

if (file.exists(output_file)) {
  knitr::include_graphics(output_file)
}

```

# SUPPLEMENTARY FIGURE: UMAP Biplot Code (Correlation-Based Approach)
```{r , fig.height=16, fig.width=16}
library(Seurat)
library(ggplot2)
library(dplyr)
library(ggrepel)

# 1. Extract UMAP Embeddings
DefaultAssay(seurat_obj) <- "dorothea"
umap_data <- Embeddings(seurat_obj, reduction = "umap")[, 1:2] %>% 
  as.data.frame() %>% 
  mutate(Cluster = as.factor(seurat_obj$seurat_clusters))

# 2. Get TF Activity Matrix
tf_matrix <- GetAssayData(seurat_obj, assay = "dorothea", layer = "data")

# 3. Compute TF Correlations with UMAP Axes
tf_cors <- data.frame(
  TF = rownames(tf_matrix),
  UMAP_1_cor = apply(tf_matrix, 1, function(x) cor(x, umap_data$umap_1)),
  UMAP_2_cor = apply(tf_matrix, 1, function(x) cor(x, umap_data$umap_2))
)

# 4. Filter Top Drivers (Top 3 per quadrant)
umap1_top <- tf_cors %>% arrange(desc(UMAP_1_cor)) %>% head(3) %>% pull(TF)
umap1_bottom <- tf_cors %>% arrange(UMAP_1_cor) %>% head(3) %>% pull(TF)
umap2_top <- tf_cors %>% arrange(desc(UMAP_2_cor)) %>% head(3) %>% pull(TF)
umap2_bottom <- tf_cors %>% arrange(UMAP_2_cor) %>% head(3) %>% pull(TF)

top_tf_drivers <- unique(c(umap1_top, umap1_bottom, umap2_top, umap2_bottom))

# 5. Prepare Arrow Data
arrow_data <- tf_cors %>% filter(TF %in% top_tf_drivers)

# Scale arrows to fit nicely on the plot
scale_factor <- max(abs(umap_data$umap_1)) / max(abs(arrow_data$UMAP_1_cor)) * 0.7
arrow_data$UMAP_1 <- arrow_data$UMAP_1_cor * scale_factor
arrow_data$UMAP_2 <- arrow_data$UMAP_2_cor * scale_factor

# 6. Calculate Cluster Centroids for Labels
cluster_centroids <- umap_data %>%
  group_by(Cluster) %>%
  summarise(
    umap_1_center = median(umap_1),
    umap_2_center = median(umap_2)
  )

# 7. Plot UMAP Biplot with Cluster Labels
p_umap_biplot <- ggplot(umap_data, aes(x = umap_1, y = umap_2)) +
  # Points (Cells) - larger and more visible
  geom_point(aes(color = Cluster), alpha = 0.7, size = 2) +
  
  # Cluster Labels (at centroids)
  geom_text(data = cluster_centroids, 
            aes(x = umap_1_center, y = umap_2_center, label = Cluster),
            fontface = "bold", size = 6, color = "black") +
  
  # Arrows (TFs) - from center of UMAP
  geom_segment(data = arrow_data, 
               aes(x = mean(umap_data$umap_1), y = mean(umap_data$umap_2), 
                   xend = mean(umap_data$umap_1) + UMAP_1, 
                   yend = mean(umap_data$umap_2) + UMAP_2),
               arrow = arrow(length = unit(0.25, "cm")), 
               color = "darkred", linewidth = 1) +
  
  # TF Labels (using ggrepel to avoid overlap with cluster labels)
  geom_text_repel(data = arrow_data, 
                  aes(x = mean(umap_data$umap_1) + UMAP_1, 
                      y = mean(umap_data$umap_2) + UMAP_2, 
                      label = TF),
                  fontface = "bold.italic", color = "darkred", size = 4.5, 
                  box.padding = 0.6, point.padding = 0.4,
                  min.segment.length = 0.2) +
  
  # Styling
  scale_color_manual(values = Seurat::DiscretePalette(14)) +
  labs(title = "Regulatory Drivers in UMAP Space",
       subtitle = "Cluster numbers shown at centroids; TF arrows indicate regulatory associations",
       x = "UMAP_1", 
       y = "UMAP_2") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "right",
    panel.grid.minor = element_blank()
  )

# Save
ggsave("Output_Figures/Supplementary_TF_UMAP_Biplot.pdf", p_umap_biplot, width = 12, height = 9)
print(p_umap_biplot)
```



# Check marker expression to validate root choice
```{r , fig.height=16, fig.width=16}
library(Seurat)
library(ggplot2)
library(dplyr)
library(pheatmap)

# Make sure SCT is default
DefaultAssay(seurat_obj) <- "SCT"

# Sézary-specific markers
sezary_markers <- list(
  early_memory = c("CCR7", "SELL", "TCF7", "LEF1"),
  malignant = c("TOX", "TWIST1", "DNM3", "STAT5A"),
  exhausted = c("PDCD1", "CTLA4", "TIGIT", "LAG3"),
  proliferative = c("MKI67", "TOP2A", "PCNA")
)

all_markers <- unlist(sezary_markers)

# ============================================
# 1. DOT PLOT - ALL CLUSTERS
# ============================================
p_dot <- DotPlot(seurat_obj, 
                 features = sezary_markers,
                 assay = "SCT",  # Explicitly specify assay
                 cols = c("lightgrey", "red"),
                 dot.scale = 6) +
  coord_flip() +
  ggtitle("Sézary Markers Across All Clusters") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
        axis.text.y = element_text(size = 10)) +
  labs(x = "Markers", y = "Clusters")

print(p_dot)
ggsave("Output_Figures/Sezary_Markers_DotPlot_AllClusters.pdf", p_dot, width = 14, height = 8)

# ============================================
# 2. AVERAGE EXPRESSION (FIXED)
# ============================================
# Use AggregateExpression (recommended in Seurat v5) with correct assay
avg_exp <- AggregateExpression(seurat_obj, 
                                features = all_markers,
                                assays = "SCT",  # Specify SCT only
                                group.by = "seurat_clusters",
                                slot = "data",
                                return.seurat = FALSE)

# Extract the SCT matrix
expr_matrix <- as.matrix(avg_exp$SCT)

# Check dimensions
cat("Expression matrix dimensions:", nrow(expr_matrix), "genes x", ncol(expr_matrix), "clusters\n")
print(head(expr_matrix))

# ============================================
# 3. HEATMAP - Scaled Expression
# ============================================
# Scale by row (gene) for better visualization
expr_matrix_scaled <- t(scale(t(expr_matrix)))

# Annotation for marker categories
annotation_row <- data.frame(
  Category = rep(names(sezary_markers), times = sapply(sezary_markers, length)),
  row.names = all_markers
)

annotation_colors <- list(
  Category = c(early_memory = "#4DAF4A", 
               malignant = "#E41A1C",
               exhausted = "#377EB8",
               proliferative = "#FF7F00")
)

pdf("Output_Figures/Sezary_Markers_Heatmap_AllClusters.pdf", width = 12, height = 8)
pheatmap(expr_matrix_scaled,
         cluster_rows = FALSE,
         cluster_cols = TRUE,
         annotation_row = annotation_row,
         annotation_colors = annotation_colors,
         color = colorRampPalette(c("navy", "white", "firebrick"))(100),
         main = "Sézary Markers - All Clusters (Scaled)",
         fontsize = 10,
         angle_col = 0)
dev.off()

# ============================================
# 4. CALCULATE CATEGORY SCORES PER CLUSTER
# ============================================
category_scores <- data.frame()

for(category in names(sezary_markers)) {
  markers_in_category <- sezary_markers[[category]]
  
  # Calculate mean expression per cluster for this category
  category_mean <- colMeans(expr_matrix[markers_in_category, , drop = FALSE])
  
  temp_df <- data.frame(
    Cluster = names(category_mean),
    Category = category,
    Score = as.numeric(category_mean)
  )
  
  category_scores <- rbind(category_scores, temp_df)
}

# Reshape for easier analysis
library(tidyr)
category_scores_wide <- category_scores %>%
  pivot_wider(names_from = Category, values_from = Score)

# Remove 'g' prefix if present (Seurat adds this)
category_scores_wide$Cluster <- gsub("^g", "", category_scores_wide$Cluster)

cat("\n=== CATEGORY SCORES BY CLUSTER ===\n")
print(category_scores_wide)

# ============================================
# 5. IDENTIFY CLUSTER PHENOTYPES
# ============================================
cat("\n=== CLUSTER PHENOTYPE CLASSIFICATION ===\n")

# Rank clusters by each category
rankings <- category_scores_wide %>%
  mutate(
    early_memory_rank = rank(-early_memory),
    malignant_rank = rank(-malignant),
    exhausted_rank = rank(-exhausted),
    proliferative_rank = rank(-proliferative)
  )

# Identify top 3 clusters per category
cat("\n✓ Top 3 STEM-LIKE clusters (Early Memory):\n")
top_memory <- rankings %>% arrange(early_memory_rank) %>% head(3) %>% 
  select(Cluster, early_memory, early_memory_rank)
print(top_memory)

cat("\n✓ Top 3 MALIGNANT clusters:\n")
top_malignant <- rankings %>% arrange(malignant_rank) %>% head(3) %>% 
  select(Cluster, malignant, malignant_rank)
print(top_malignant)

cat("\n✓ Top 3 EXHAUSTED/TERMINAL clusters:\n")
top_exhausted <- rankings %>% arrange(exhausted_rank) %>% head(3) %>% 
  select(Cluster, exhausted, exhausted_rank)
print(top_exhausted)

cat("\n✓ Top 3 PROLIFERATIVE clusters:\n")
top_prolif <- rankings %>% arrange(proliferative_rank) %>% head(3) %>% 
  select(Cluster, proliferative, proliferative_rank)
print(top_prolif)

# ============================================
# 6. BARPLOT - Category Scores
# ============================================
p_bar <- ggplot(category_scores, aes(x = gsub("^g", "", Cluster), y = Score, fill = Category)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = c(early_memory = "#4DAF4A", 
                                malignant = "#E41A1C",
                                exhausted = "#377EB8",
                                proliferative = "#FF7F00")) +
  labs(title = "Functional Signatures Across All Clusters",
       x = "Cluster", y = "Mean Expression Score") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(face = "bold", size = 14),
        legend.position = "right")

print(p_bar)
ggsave("Output_Figures/Sezary_CategoryScores_Barplot.pdf", p_bar, width = 14, height = 6)

# ============================================
# 7. TRAJECTORY ROOT RECOMMENDATION
# ============================================
cat("\n=== TRAJECTORY INFERENCE RECOMMENDATION ===\n")

trajectory_guide <- category_scores_wide %>%
  mutate(
    Stemness_Score = early_memory,
    Differentiation_Score = (exhausted + malignant) / 2,
    Balance = Stemness_Score - Differentiation_Score
  ) %>%
  arrange(desc(Balance))

cat("\nClusters ranked by Stemness Score:\n")
print(trajectory_guide %>% select(Cluster, Stemness_Score, Differentiation_Score, Balance))

cat("\n✓✓✓ RECOMMENDED ROOT CLUSTER: ", 
    trajectory_guide$Cluster[1], 
    " (highest stemness, lowest differentiation)\n\n", sep = "")

cat("✓ POTENTIAL TERMINAL CLUSTERS: ", 
    paste(tail(trajectory_guide$Cluster, 3), collapse = ", "),
    " (high exhaustion/malignancy)\n\n", sep = "")

# Save summary tables
write.csv(category_scores_wide, "Output_Figures/Cluster_Phenotype_Scores.csv", row.names = FALSE)
write.csv(trajectory_guide, "Output_Figures/Trajectory_Root_Recommendations.csv", row.names = FALSE)

cat("✓ Analysis complete! Tables saved to Output_Figures/\n")


```


# ROBUST Entropy Analysis (No External Dependencies)
```{r , fig.height=16, fig.width=16}
library(Seurat)
library(ggplot2)
library(dplyr)
library(patchwork)

DefaultAssay(seurat_obj) <- "SCT"

cat("Computing entropy-based stemness metrics...\n")

# ============================================
# Function: Shannon Entropy Calculation
# ============================================
calculate_shannon_entropy <- function(expression_vector) {
  expr_nonzero <- expression_vector[expression_vector > 0]
  if(length(expr_nonzero) == 0) return(0)
  
  probs <- expr_nonzero / sum(expr_nonzero)
  entropy <- -sum(probs * log2(probs))
  return(entropy)
}

# ============================================
# Function: Gini Coefficient (Specialization)
# ============================================
calculate_gini <- function(x) {
  x <- x[x > 0]
  if(length(x) == 0) return(0)
  
  n <- length(x)
  x_sorted <- sort(x)
  gini <- (2 * sum((1:n) * x_sorted)) / (n * sum(x_sorted)) - (n + 1) / n
  return(gini)
}

# ============================================
# STEP 1: Calculate Entropy Per Cell
# ============================================
expr_matrix <- GetAssayData(seurat_obj, assay = "SCT", layer = "data")
hvgs <- VariableFeatures(seurat_obj)
expr_hvg <- expr_matrix[hvgs, ]

cat("Calculating entropy for", ncol(expr_hvg), "cells using", length(hvgs), "HVGs...\n")

# Shannon Entropy
entropy_scores <- apply(expr_hvg, 2, calculate_shannon_entropy)
seurat_obj$Shannon_Entropy <- entropy_scores

# Gini Index
gini_scores <- apply(expr_hvg, 2, calculate_gini)
seurat_obj$Gini_Index <- gini_scores

# Transcriptional Diversity Score
seurat_obj$Diversity_Score <- seurat_obj$nFeature_RNA / max(seurat_obj$nFeature_RNA)

# ============================================
# STEP 2: Combined Stemness Score
# ============================================
# Normalize and combine metrics
seurat_obj$Stemness_Score <- 
  scales::rescale(seurat_obj$Shannon_Entropy, to = c(0, 1)) * 0.4 +
  (1 - scales::rescale(seurat_obj$Gini_Index, to = c(0, 1))) * 0.3 +  # Invert Gini
  seurat_obj$Diversity_Score * 0.3

# ============================================
# STEP 3: Per-Cluster Analysis
# ============================================
cluster_potency <- seurat_obj@meta.data %>%
  group_by(seurat_clusters) %>%
  summarize(
    n_cells = n(),
    mean_entropy = mean(Shannon_Entropy),
    mean_gini = mean(Gini_Index),
    mean_diversity = mean(Diversity_Score),
    mean_stemness = mean(Stemness_Score),
    sd_stemness = sd(Stemness_Score)
  ) %>%
  arrange(desc(mean_stemness))

cat("\n=== STEMNESS/POTENCY SCORES BY CLUSTER ===\n")
print(cluster_potency)

# ============================================
# VISUALIZATION 1: Stemness Score Barplot
# ============================================
p1 <- ggplot(cluster_potency, 
             aes(x = reorder(seurat_clusters, -mean_stemness), 
                 y = mean_stemness,
                 fill = mean_stemness)) +
  geom_bar(stat = "identity") +
  geom_errorbar(aes(ymin = mean_stemness - sd_stemness,
                    ymax = mean_stemness + sd_stemness),
                width = 0.3, alpha = 0.5) +
  scale_fill_gradient(low = "blue", high = "red") +
  labs(title = "Entropy-Based Stemness Score by Cluster",
       subtitle = "Combined metric: Shannon entropy + Gini + Diversity",
       x = "Cluster", y = "Mean Stemness Score",
       fill = "Stemness") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 11),
        plot.title = element_text(face = "bold", size = 14))

print(p1)

# ============================================
# VISUALIZATION 2: Stemness on UMAP
# ============================================
p2 <- FeaturePlot(seurat_obj, 
                  features = "Stemness_Score",
                  reduction = "umap",
                  pt.size = 1.5) +
  scale_color_gradient(low = "blue", high = "red") +
  labs(title = "Stemness Score in UMAP Space",
       subtitle = "Red = High stemness/potency")

print(p2)

# ============================================
# VISUALIZATION 3: Component Scores Heatmap
# ============================================
library(reshape2)
heatmap_data <- cluster_potency %>%
  select(seurat_clusters, mean_entropy, mean_gini, mean_diversity) %>%
  mutate(mean_gini = 1 - mean_gini) %>%  # Invert for consistency
  mutate(across(c(mean_entropy, mean_gini, mean_diversity), 
                ~scales::rescale(.x, to = c(0, 1)))) %>%
  melt(id.vars = "seurat_clusters")

p3 <- ggplot(heatmap_data, aes(x = variable, y = seurat_clusters, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(value, 2)), color = "black", size = 3.5) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", 
                       midpoint = 0.5) +
  labs(title = "Component Metrics of Stemness",
       x = "Metric", y = "Cluster", fill = "Normalized\nScore") +
  scale_x_discrete(labels = c("Entropy", "Specialization", "Diversity")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p3)

# ============================================
# VISUALIZATION 4: Violin Plot
# ============================================
p4 <- VlnPlot(seurat_obj, 
              features = "Stemness_Score",
              pt.size = 0) +
  labs(title = "Stemness Score Distribution",
       x = "Cluster", y = "Stemness Score") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p4)

# ============================================
# INTEGRATE WITH SÉZARY MARKERS
# ============================================
sezary_markers <- list(
  early_memory = c("CCR7", "SELL", "TCF7", "LEF1"),
  exhausted = c("PDCD1", "CTLA4", "TIGIT", "LAG3")
)

seurat_obj <- AddModuleScore(seurat_obj, 
                             features = sezary_markers,
                             name = c("EarlyMemory", "Exhausted"))

cor_memory <- cor(seurat_obj$Stemness_Score, seurat_obj$EarlyMemory1, use = "complete.obs")
cor_exhausted <- cor(seurat_obj$Stemness_Score, seurat_obj$Exhausted2, use = "complete.obs")

cat("\n=== VALIDATION AGAINST MARKER GENES ===\n")
cat("Correlation Stemness vs Early Memory:", round(cor_memory, 3), "\n")
cat("Correlation Stemness vs Exhaustion:", round(cor_exhausted, 3), "\n")

p5 <- ggplot(seurat_obj@meta.data, 
             aes(x = Stemness_Score, y = EarlyMemory1, color = seurat_clusters)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", color = "black") +
  labs(title = "Stemness Score vs Early Memory Markers",
       subtitle = paste0("r = ", round(cor_memory, 3)),
       x = "Stemness Score", y = "Early Memory Score") +
  theme_minimal()

print(p5)

# ============================================
# ROOT RECOMMENDATION
# ============================================
cat("\n=== TRAJECTORY ROOT RECOMMENDATION ===\n\n")
cat("✓✓✓ HIGHEST STEMNESS/POTENCY CLUSTER: ", cluster_potency$seurat_clusters, "\n")[1]
cat("     Stemness Score: ", round(cluster_potency$mean_stemness, 3), "\n\n")[1]

cat("✓ Top 3 stem-like clusters:\n")
print(cluster_potency %>% head(3) %>% select(seurat_clusters, mean_stemness))

cat("\n✓ Top 3 differentiated clusters:\n")
print(cluster_potency %>% tail(3) %>% select(seurat_clusters, mean_stemness))

# ============================================
# SAVE RESULTS
# ============================================
write.csv(cluster_potency, 
          "Output_Figures/Stemness_Potency_ByCluster.csv", 
          row.names = FALSE)

# Combined figure
p_combined <- (p1 | p2) / (p3 | p4) + 
  plot_annotation(title = "Entropy-Based Stemness Analysis",
                  subtitle = "Root-free differentiation state assessment",
                  theme = theme(plot.title = element_text(size = 16, face = "bold")))

print(p_combined)
ggsave("Output_Figures/Stemness_Analysis_Complete.pdf", p_combined, 
       width = 16, height = 12)

cat("\n✓ Analysis complete! All results saved to Output_Figures/\n")
```




```{r , fig.height=16, fig.width=16}


```


