1. load libraries

2. Load QC seurat Object


ss_Harro <- readRDS("ss_Harro_SS_4_MF_3_QC_object.rds")

3. Log-normalization (scale factor = 10,000) and Integration



# 1. Split Seurat object by sample
ss_list <- SplitObject(ss_Harro, split.by = "orig.ident")

# 2. Log-normalization + variable feature selection + cell cycle scoring
ss_list <- lapply(ss_list, function(x) {
  x <- NormalizeData(x, normalization.method = "LogNormalize", scale.factor = 10000)
  x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 5000)
  x <- CellCycleScoring(x, s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes)
  return(x)
})
Normalizing layer: counts.SS_P1.SeuratProject.SeuratProject.SeuratProject.SeuratProject.SeuratProject
Performing log-normalization
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Finding variable features for layer counts.SS_P1.SeuratProject.SeuratProject.SeuratProject.SeuratProject.SeuratProject
Calculating gene variances
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Avis : The `slot` argument of `GetAssayData()` is deprecated as of SeuratObject 5.0.0.
Please use the `layer` argument instead.Avis : The following features are not present in the object: MLF1IP, E2F8, not searching for symbol synonymsAvis : The following features are not present in the object: FAM64A, HN1, CDC25C, GAS2L3, not searching for symbol synonymsNormalizing layer: counts.SS_P2.SeuratProject.SeuratProject.SeuratProject.SeuratProject.SeuratProject
Performing log-normalization
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Finding variable features for layer counts.SS_P2.SeuratProject.SeuratProject.SeuratProject.SeuratProject.SeuratProject
Calculating gene variances
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Avis : The following features are not present in the object: MLF1IP, not searching for symbol synonymsAvis : The following features are not present in the object: FAM64A, HN1, GAS2L3, not searching for symbol synonymsNormalizing layer: counts.SS_P3.SeuratProject.SeuratProject.SeuratProject.SeuratProject
Performing log-normalization
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Finding variable features for layer counts.SS_P3.SeuratProject.SeuratProject.SeuratProject.SeuratProject
Calculating gene variances
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Avis : The following features are not present in the object: MLF1IP, not searching for symbol synonymsAvis : The following features are not present in the object: FAM64A, HN1, not searching for symbol synonymsNormalizing layer: counts.SS_P4.SeuratProject.SeuratProject.SeuratProject
Performing log-normalization
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Finding variable features for layer counts.SS_P4.SeuratProject.SeuratProject.SeuratProject
Calculating gene variances
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Avis : The following features are not present in the object: MLF1IP, not searching for symbol synonymsAvis : The following features are not present in the object: FAM64A, HN1, CDC25C, NEK2, GAS2L3, not searching for symbol synonymsNormalizing layer: counts.MF_P1.SeuratProject.SeuratProject
Performing log-normalization
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Finding variable features for layer counts.MF_P1.SeuratProject.SeuratProject
Calculating gene variances
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Avis : The following features are not present in the object: MLF1IP, not searching for symbol synonymsAvis : The following features are not present in the object: FAM64A, HN1, not searching for symbol synonymsNormalizing layer: counts.MF_P2.SeuratProject
Performing log-normalization
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Finding variable features for layer counts.MF_P2.SeuratProject
Calculating gene variances
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Avis : The following features are not present in the object: MLF1IP, not searching for symbol synonymsAvis : The following features are not present in the object: FAM64A, HN1, GAS2L3, not searching for symbol synonymsNormalizing layer: counts.MF_P3
Performing log-normalization
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Finding variable features for layer counts.MF_P3
Calculating gene variances
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Avis : The following features are not present in the object: MLF1IP, not searching for symbol synonymsAvis : The following features are not present in the object: FAM64A, HN1, not searching for symbol synonyms
# 3. Intersect genes across all objects to ensure consistency
common_genes <- Reduce(intersect, lapply(ss_list, rownames))
ss_list <- lapply(ss_list, function(x) {
  x <- subset(x, features = common_genes)
  return(x)
})

# 4. Remove TCR/Ig genes from variable features
tcr_genes <- grep("^TR[ABGD]|^IG[HKL]", common_genes, value = TRUE)
ss_list <- lapply(ss_list, function(x) {
  vgs <- VariableFeatures(x)
  vgs <- setdiff(vgs, tcr_genes)
  VariableFeatures(x) <- vgs
  return(x)
})

# 5. Run PCA on each object (required for RPCA)
ss_list <- lapply(ss_list, function(x) {
  x <- ScaleData(x, features = VariableFeatures(x), verbose = FALSE)
  x <- RunPCA(x, features = VariableFeatures(x), verbose = FALSE)
  return(x)
})

# 6. Select integration features
features <- SelectIntegrationFeatures(object.list = ss_list, nfeatures = 8000)

# 7. Find integration anchors using RPCA
anchors <- FindIntegrationAnchors(
  object.list = ss_list,
  anchor.features = features,
  reduction = "rpca",
  dims = 1:40
)
Scaling features for provided objects

  |                                                  | 0 % ~calculating  
Avis : Different features in new layer data than already exists for scale.data

  |++++++++                                          | 14% ~32s          
Avis : Different features in new layer data than already exists for scale.data

  |+++++++++++++++                                   | 29% ~21s          
Avis : Different features in new layer data than already exists for scale.data

  |++++++++++++++++++++++                            | 43% ~22s          
Avis : Different features in new layer data than already exists for scale.data

  |+++++++++++++++++++++++++++++                     | 57% ~14s          
Avis : Different features in new layer data than already exists for scale.data

  |++++++++++++++++++++++++++++++++++++              | 71% ~14s          
Avis : Different features in new layer data than already exists for scale.data

  |+++++++++++++++++++++++++++++++++++++++++++       | 86% ~07s          
Avis : Different features in new layer data than already exists for scale.data

  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=47s  
Computing within dataset neighborhoods

  |                                                  | 0 % ~calculating  
  |++++++++                                          | 14% ~43s          
  |+++++++++++++++                                   | 29% ~26s          
  |++++++++++++++++++++++                            | 43% ~28s          
  |+++++++++++++++++++++++++++++                     | 57% ~19s          
  |++++++++++++++++++++++++++++++++++++              | 71% ~19s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 86% ~09s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=01m 02s
Finding all pairwise anchors

  |                                                  | 0 % ~calculating  
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 884 anchors

  |+++                                               | 5 % ~13m 19s      
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 1521 anchors

  |+++++                                             | 10% ~15m 36s      
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 944 anchors

  |++++++++                                          | 14% ~14m 48s      
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 1093 anchors

  |++++++++++                                        | 19% ~13m 09s      
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 743 anchors

  |++++++++++++                                      | 24% ~11m 20s      
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 1118 anchors

  |+++++++++++++++                                   | 29% ~10m 43s      
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 984 anchors

  |+++++++++++++++++                                 | 33% ~11m 53s      
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 735 anchors

  |++++++++++++++++++++                              | 38% ~12m 01s      
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 1102 anchors

  |++++++++++++++++++++++                            | 43% ~12m 24s      
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 898 anchors

  |++++++++++++++++++++++++                          | 48% ~11m 50s      
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 907 anchors

  |+++++++++++++++++++++++++++                       | 52% ~10m 44s      
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 707 anchors

  |+++++++++++++++++++++++++++++                     | 57% ~09m 28s      
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 906 anchors

  |+++++++++++++++++++++++++++++++                   | 62% ~08m 27s      
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 828 anchors

  |++++++++++++++++++++++++++++++++++                | 67% ~07m 16s      
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 4606 anchors

  |++++++++++++++++++++++++++++++++++++              | 71% ~06m 32s      
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 891 anchors

  |+++++++++++++++++++++++++++++++++++++++           | 76% ~05m 22s      
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 566 anchors

  |+++++++++++++++++++++++++++++++++++++++++         | 81% ~04m 11s      
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 773 anchors

  |+++++++++++++++++++++++++++++++++++++++++++       | 86% ~03m 07s      
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 720 anchors

  |++++++++++++++++++++++++++++++++++++++++++++++    | 90% ~02m 01s      
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 1315 anchors

  |++++++++++++++++++++++++++++++++++++++++++++++++  | 95% ~01m 03s      
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 1375 anchors

  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=21m 48s
# 8. Integrate data
ss_integrated <- IntegrateData(anchorset = anchors, dims = 1:40)
Merging dataset 6 into 5
Extracting anchors for merged samples
Finding integration vectors
Finding integration vector weights
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Integrating data
Avis : Layer counts isn't present in the assay object; returning NULLMerging dataset 4 into 3
Extracting anchors for merged samples
Finding integration vectors
Finding integration vector weights
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Integrating data
Avis : Layer counts isn't present in the assay object; returning NULLMerging dataset 2 into 1
Extracting anchors for merged samples
Finding integration vectors
Finding integration vector weights
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Integrating data
Avis : Layer counts isn't present in the assay object; returning NULLMerging dataset 7 into 5 6
Extracting anchors for merged samples
Finding integration vectors
Avis : Different cells in new layer data than already exists for scale.dataFinding integration vector weights
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Integrating data
Avis : Layer counts isn't present in the assay object; returning NULLMerging dataset 1 2 into 3 4
Extracting anchors for merged samples
Finding integration vectors
Avis : The `slot` argument of `SetAssayData()` is deprecated as of SeuratObject 5.0.0.
Please use the `layer` argument instead.Finding integration vector weights
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Integrating data
Avis : Layer counts isn't present in the assay object; returning NULLMerging dataset 3 4 1 2 into 5 6 7
Extracting anchors for merged samples
Finding integration vectors

4. PCA + UMAP


ss_Harro <- ss_integrated

# PCA
ss_Harro <- RunPCA(ss_Harro, npcs = 50)

# Optional: Visualize elbow plot
ElbowPlot(ss_Harro, ndims = 50)

5. PCA Visualization




# TEST-1
# given that the output of RunPCA is "pca"
# replace "so" by the name of your seurat object

pct <- ss_Harro[["pca"]]@stdev / sum(ss_Harro[["pca"]]@stdev) * 100
cumu <- cumsum(pct) # Calculate cumulative percents for each PC
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which((pct[-length(pct)] - pct[-1]) > 0.1), decreasing = T)[1] + 1
# last point where change of % of variation is more than 0.1%. -> co2
co2

# TEST-2
# get significant PCs
stdv <- ss_Harro[["pca"]]@stdev
sum.stdv <- sum(ss_Harro[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
cumulative <- cumsum(percent.stdv)
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co2 <- sort(which((percent.stdv[1:length(percent.stdv) - 1] - 
                       percent.stdv[2:length(percent.stdv)]) > 0.1), 
              decreasing = T)[1] + 1
min.pc <- min(co1, co2)
min.pc

# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv, 
           cumu = cumulative, 
           rank = 1:length(percent.stdv))

# Elbow plot to visualize 
  ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) + 
  geom_text() + 
  geom_vline(xintercept = 90, color = "grey") + 
  geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
  theme_bw()

  

6. Clustering (resolution = 1.0)

# Then find neighbors & clusters
ss_Harro <- FindNeighbors(ss_Harro, dims = 1:40)
ss_Harro <- FindClusters(ss_Harro, resolution = 1.0)

# run UMAP
ss_Harro <- RunUMAP(ss_Harro, dims = 1:40)

ss_Harro <-RunTSNE(ss_Harro, dims.use = 1:40)

7. Visualize UMAP with Clusters


DimPlot(ss_Harro, reduction = "umap",group.by = "orig.ident", label = TRUE,repel = T, pt.size = 0.6) +
  ggtitle("UMAP of Sézary Syndrome CD4+ T Cells")


DimPlot(ss_Harro, reduction = "umap", label = TRUE,repel = T, pt.size = 0.6) +
  ggtitle("UMAP of Sézary Syndrome CD4+ T Cells")


DimPlot(ss_Harro, reduction = "tsne", label = TRUE, pt.size = 0.6) +
  ggtitle("TSNE of Sézary Syndrome CD4+ T Cells")

Save the Seurat object as an RDS



saveRDS(ss_Harro, file = "ss_Harro_SS_4_MF_3_Integrated_object_before_featureplot.rds")

8. FeaturePlots for Top50 UP

top_50_up <- read.csv("top_50_upregulated.csv")        # or read.delim("top_50_up.tsv")
top_50_down <- read.csv("top_50_downregulated.csv")



Idents(ss_Harro) <- "seurat_clusters"


FeaturePlot(ss_Harro, 
             features = top_50_up$gene[1:10], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)


FeaturePlot(ss_Harro, 
             features = top_50_up$gene[11:20], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

FeaturePlot(ss_Harro, 
             features = top_50_up$gene[21:30], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
FeaturePlot(ss_Harro, 
             features = top_50_up$gene[31:40], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
FeaturePlot(ss_Harro, 
             features = top_50_up$gene[41:50], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

9. FeaturePlots for Top50 DOWN


FeaturePlot(ss_Harro, 
             features = top_50_down$gene[1:10], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)


FeaturePlot(ss_Harro, 
             features = top_50_down$gene[11:20], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

FeaturePlot(ss_Harro, 
             features = top_50_down$gene[21:30], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
FeaturePlot(ss_Harro, 
             features = top_50_down$gene[31:40], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
FeaturePlot(ss_Harro, 
             features = top_50_down$gene[41:50], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

Visualization



DimPlot(ss_Harro, group.by = "seurat_clusters", label = T, label.box = T, repel = T, reduction = "umap")

Visualization of Potential biomarkers-Upregulated


DefaultAssay(ss_Harro) <- "RNA"
Idents(ss_Harro) <- "Disease_state"

# Vector of genes to plot
up_genes <- c("CLIC1", "COX5A","GTSF1", "MAD2L1","MYBL2","MYL6B","NME1","PLK1", "PYCR1", "SLC25A5", "SRI", "TUBA1C", "UBE2T", "YWHAH")

# DotPlot with custom firebrick-red gradient
DotPlot(ss_Harro, features = up_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Upregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )

Idents(ss_Harro) <- "orig.ident"

# Vector of genes to plot
up_genes <- c("CLIC1", "COX5A","GTSF1", "MAD2L1","MYBL2","MYL6B","NME1","PLK1", "PYCR1", "SLC25A5", "SRI", "TUBA1C", "UBE2T", "YWHAH")

# DotPlot with custom firebrick-red gradient
DotPlot(ss_Harro, features = up_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Upregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )

Idents(ss_Harro) <- "seurat_clusters"

# Vector of genes to plot
up_genes <- c("CLIC1", "COX5A","GTSF1", "MAD2L1","MYBL2","MYL6B","NME1","PLK1", "PYCR1", "SLC25A5", "SRI", "TUBA1C", "UBE2T", "YWHAH")

# DotPlot with custom firebrick-red gradient
DotPlot(ss_Harro, features = up_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Upregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )

Visualization of Potential biomarkers-Downregulated


Idents(ss_Harro) <- "Disease_state"
# Downregulated genes
down_genes <- c("TXNIP", "RASA3", "RIPOR2", 
                "ZFP36", "ZFP36L1", "ZFP36L2",
                "PRMT2", "MAX", "PIK3IP1", 
                "BTG1", "CDKN1B")

# DotPlot with firebrick color for high expression
DotPlot(ss_Harro, features = down_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Downregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )


Idents(ss_Harro) <- "orig.ident"
# Downregulated genes
down_genes <- c("TXNIP", "RASA3", "RIPOR2", 
                "ZFP36", "ZFP36L1", "ZFP36L2",
                "PRMT2", "MAX", "PIK3IP1", 
                "BTG1", "CDKN1B")

# DotPlot with firebrick color for high expression
DotPlot(ss_Harro, features = down_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Downregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )
Idents(ss_Harro) <- "seurat_clusters"

# DotPlot with firebrick color for high expression
DotPlot(ss_Harro, features = down_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Downregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )

Save the Seurat object as an RDS



saveRDS(ss_Harro, file = "ss_Harro_SS_4_MF_3_Integrated_object_after_featureplot_final.rds")
---
title: "Potential biomarkers Validation (Harro_4_SS_and_3_MF_Patient_Samples)-Integration"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  #rmdformats::readthedown
  html_notebook:
    toc: true
    toc_float: true
    toc_collapsed: true
---

# 1. load libraries
```{r setup, include=FALSE}

library(Seurat)
library(SeuratObject)
library(SeuratData)
library(patchwork)
library(Azimuth)
library(dplyr)
library(ggplot2)
library(tidyverse)
library(rmarkdown)
library(tinytex)


library(dplyr)
library(dittoSeq)
library(ggrepel)
#library(ggtree)
library(parallel)
library(plotly)  # 3D plot
library(Seurat)  # Idents()
library(SeuratDisk)  # SaveH5Seurat()
library(tibble)  # rownnames_to_column
library(harmony) # RunHarmony()
#options(mc.cores = detectCores() - 1)

library(dplyr)
library(tidyverse)
library(ggplot2)
library(RColorBrewer)
library(magrittr)
library(dbplyr)
library(rmarkdown)
library(knitr)
library(tinytex)
#Azimuth Annotation libraries
library(Azimuth)
#ProjecTils Annotation libraries
library(STACAS)
library(ProjecTILs)
#singleR Annotation libraries

library(SingleCellExperiment)

```


# 2. Load QC seurat Object
```{r loadSeurat}

ss_Harro <- readRDS("ss_Harro_SS_4_MF_3_QC_object.rds")

```


# 3. Log-normalization (scale factor = 10,000) and Integration
```{r PCA, fig.height=6, fig.width=10}


# 1. Split Seurat object by sample
ss_list <- SplitObject(ss_Harro, split.by = "orig.ident")

# 2. Intersect genes across all objects to ensure consistency
common_genes <- Reduce(intersect, lapply(ss_list, rownames))
ss_list <- lapply(ss_list, function(x) {
  x <- subset(x, features = common_genes)
  return(x)
})

# 3. Log-normalization + variable feature selection + cell cycle scoring
ss_list <- lapply(ss_list, function(x) {
  x <- NormalizeData(x, normalization.method = "LogNormalize", scale.factor = 10000)
  x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 5000)
  x <- CellCycleScoring(x, s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes)
  return(x)
})

# 4. Remove canonical TCR/Ig genes from variable features (safe and specific regex)
tcr_ig_genes <- grep(
  "^TRA[VJD]|^TRB[VJD]|^TRD[VJD]|^TRG[VJD]|^IGH[VDJ]|^IGK[VDJ]|^IGL[VDJ]",
  common_genes,
  value = TRUE
)
ss_list <- lapply(ss_list, function(x) {
  vgs <- VariableFeatures(x)
  vgs <- setdiff(vgs, tcr_ig_genes)
  VariableFeatures(x) <- vgs
  return(x)
})

# 5. Run PCA on each object (required for RPCA)
ss_list <- lapply(ss_list, function(x) {
  x <- ScaleData(x, features = VariableFeatures(x), verbose = FALSE)
  x <- RunPCA(x, features = VariableFeatures(x), verbose = FALSE)
  return(x)
})

# 6. Select integration features
features <- SelectIntegrationFeatures(object.list = ss_list, nfeatures = 8000)

# 7. Find integration anchors using RPCA
anchors <- FindIntegrationAnchors(
  object.list = ss_list,
  anchor.features = features,
  reduction = "rpca",
  dims = 1:40
)

# 8. Integrate data
ss_integrated <- IntegrateData(anchorset = anchors, dims = 1:40)

# 9. Set the default assay to "integrated"
DefaultAssay(ss_integrated) <- "integrated"

# 10. Scale integrated data and regress out unwanted variation
ss_integrated <- ScaleData(
  ss_integrated,
  vars.to.regress = c("nCount_RNA", "percent.mt", "S.Score", "G2M.Score")
)

```


# 4. PCA + UMAP
```{r Elbow, fig.height=6, fig.width=10}

ss_Harro <- ss_integrated

# PCA
ss_Harro <- RunPCA(ss_Harro, npcs = 50)

# Optional: Visualize elbow plot
ElbowPlot(ss_Harro, ndims = 50)

```

# 5. PCA Visualization
```{r PCA-TEST, fig.height=6, fig.width=10}



# TEST-1
# given that the output of RunPCA is "pca"
# replace "so" by the name of your seurat object

pct <- ss_Harro[["pca"]]@stdev / sum(ss_Harro[["pca"]]@stdev) * 100
cumu <- cumsum(pct) # Calculate cumulative percents for each PC
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which((pct[-length(pct)] - pct[-1]) > 0.1), decreasing = T)[1] + 1
# last point where change of % of variation is more than 0.1%. -> co2
co2

# TEST-2
# get significant PCs
stdv <- ss_Harro[["pca"]]@stdev
sum.stdv <- sum(ss_Harro[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
cumulative <- cumsum(percent.stdv)
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co2 <- sort(which((percent.stdv[1:length(percent.stdv) - 1] - 
                       percent.stdv[2:length(percent.stdv)]) > 0.1), 
              decreasing = T)[1] + 1
min.pc <- min(co1, co2)
min.pc

# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv, 
           cumu = cumulative, 
           rank = 1:length(percent.stdv))

# Elbow plot to visualize 
  ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) + 
  geom_text() + 
  geom_vline(xintercept = 90, color = "grey") + 
  geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
  theme_bw()

  

```

# 6. Clustering (resolution = 1.0)
```{r Clustering, fig.height=6, fig.width=10}
# Then find neighbors & clusters
ss_Harro <- FindNeighbors(ss_Harro, dims = 1:40)
ss_Harro <- FindClusters(ss_Harro, resolution = 1.0)

# run UMAP
ss_Harro <- RunUMAP(ss_Harro, dims = 1:40)

ss_Harro <-RunTSNE(ss_Harro, dims.use = 1:40)

```


# 7. Visualize UMAP with Clusters
```{r UMAP, fig.height=6, fig.width=10}

DimPlot(ss_Harro, reduction = "umap",group.by = "orig.ident", label = TRUE,repel = T, pt.size = 0.6) +
  ggtitle("UMAP of Sézary Syndrome CD4+ T Cells")


DimPlot(ss_Harro, reduction = "umap", label = TRUE,repel = T, pt.size = 0.6) +
  ggtitle("UMAP of Sézary Syndrome CD4+ T Cells")


DimPlot(ss_Harro, reduction = "tsne", label = TRUE, pt.size = 0.6) +
  ggtitle("TSNE of Sézary Syndrome CD4+ T Cells")

```

## Save the Seurat object as an RDS
```{r save1}


saveRDS(ss_Harro, file = "ss_Harro_SS_4_MF_3_Integrated_object_before_featureplot.rds")


```



# 8.  FeaturePlots for Top50 UP
```{r FeaturePlot1, fig.height=16, fig.width=20}
top_50_up <- read.csv("top_50_upregulated.csv")        # or read.delim("top_50_up.tsv")
top_50_down <- read.csv("top_50_downregulated.csv")



Idents(ss_Harro) <- "seurat_clusters"


FeaturePlot(ss_Harro, 
             features = top_50_up$gene[1:10], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)


FeaturePlot(ss_Harro, 
             features = top_50_up$gene[11:20], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

FeaturePlot(ss_Harro, 
             features = top_50_up$gene[21:30], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
FeaturePlot(ss_Harro, 
             features = top_50_up$gene[31:40], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
FeaturePlot(ss_Harro, 
             features = top_50_up$gene[41:50], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

```



# 9.  FeaturePlots for Top50 DOWN
```{r FeaturePlot2, fig.height=16, fig.width=20}

FeaturePlot(ss_Harro, 
             features = top_50_down$gene[1:10], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)


FeaturePlot(ss_Harro, 
             features = top_50_down$gene[11:20], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

FeaturePlot(ss_Harro, 
             features = top_50_down$gene[21:30], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
FeaturePlot(ss_Harro, 
             features = top_50_down$gene[31:40], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
FeaturePlot(ss_Harro, 
             features = top_50_down$gene[41:50], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
```

## Visualization
```{r umapvis, fig.height=6, fig.width=10}


DimPlot(ss_Harro, group.by = "seurat_clusters", label = T, label.box = T, repel = T, reduction = "umap")


```

## Visualization of Potential biomarkers-Upregulated
```{r biomarkersvis, fig.height=6, fig.width=10}

DefaultAssay(ss_Harro) <- "RNA"
Idents(ss_Harro) <- "Disease_state"

# Vector of genes to plot
up_genes <- c("CLIC1", "COX5A","GTSF1", "MAD2L1","MYBL2","MYL6B","NME1","PLK1", "PYCR1", "SLC25A5", "SRI", "TUBA1C", "UBE2T", "YWHAH")

# DotPlot with custom firebrick-red gradient
DotPlot(ss_Harro, features = up_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Upregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )

Idents(ss_Harro) <- "orig.ident"

# Vector of genes to plot
up_genes <- c("CLIC1", "COX5A","GTSF1", "MAD2L1","MYBL2","MYL6B","NME1","PLK1", "PYCR1", "SLC25A5", "SRI", "TUBA1C", "UBE2T", "YWHAH")

# DotPlot with custom firebrick-red gradient
DotPlot(ss_Harro, features = up_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Upregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )

Idents(ss_Harro) <- "seurat_clusters"

# Vector of genes to plot
up_genes <- c("CLIC1", "COX5A","GTSF1", "MAD2L1","MYBL2","MYL6B","NME1","PLK1", "PYCR1", "SLC25A5", "SRI", "TUBA1C", "UBE2T", "YWHAH")

# DotPlot with custom firebrick-red gradient
DotPlot(ss_Harro, features = up_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Upregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )


```



## Visualization of Potential biomarkers-Downregulated
```{r vis2, fig.height=6, fig.width=10}

Idents(ss_Harro) <- "Disease_state"
# Downregulated genes
down_genes <- c("TXNIP", "RASA3", "RIPOR2", 
                "ZFP36", "ZFP36L1", "ZFP36L2",
                "PRMT2", "MAX", "PIK3IP1", 
                "BTG1", "CDKN1B")

# DotPlot with firebrick color for high expression
DotPlot(ss_Harro, features = down_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Downregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )


Idents(ss_Harro) <- "orig.ident"
# Downregulated genes
down_genes <- c("TXNIP", "RASA3", "RIPOR2", 
                "ZFP36", "ZFP36L1", "ZFP36L2",
                "PRMT2", "MAX", "PIK3IP1", 
                "BTG1", "CDKN1B")

# DotPlot with firebrick color for high expression
DotPlot(ss_Harro, features = down_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Downregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )
Idents(ss_Harro) <- "seurat_clusters"

# DotPlot with firebrick color for high expression
DotPlot(ss_Harro, features = down_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Downregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )
```

## Save the Seurat object as an RDS
```{r saveRDSFinal}


saveRDS(ss_Harro, file = "ss_Harro_SS_4_MF_3_Integrated_object_after_featureplot_final.rds")


```

