1. load libraries

2. Load Data into Seurat


library(Seurat)
library(dplyr)

# Set directory paths
ss_dir <- "data/SS_Patients/"
MF_dir <- "data/MF_Patients/"

ss_dirs <- sort(list.dirs(ss_dir, full.names = TRUE, recursive = FALSE))
mf_dirs <- sort(list.dirs(MF_dir, full.names = TRUE, recursive = FALSE))

ss_names <- paste0("SS_P", 1:4)
mf_names <- paste0("MF_P", 1:3)

# Initialize list for Seurat objects
seurat_objects <- list()

# Load SS samples
for (i in seq_along(ss_dirs)) {
  dat <- Read10X(data.dir = ss_dirs[i])
  seu <- CreateSeuratObject(counts = dat, project = ss_names[i], min.cells = 3, min.features = 200)
  seu$sample <- ss_names[i]
  seu <- RenameCells(seu, add.cell.id = ss_names[i])
  seu$Disease_state <- "Sézary"
  seu$Patient_ID <- gsub("^SS_P", "P", ss_names[i])
  seurat_objects[[ss_names[i]]] <- seu
}

# Load MF samples
for (i in seq_along(mf_dirs)) {
  dat <- Read10X(data.dir = mf_dirs[i])
  seu <- CreateSeuratObject(counts = dat, project = mf_names[i], min.cells = 3, min.features = 200)
  seu$sample <- mf_names[i]
  seu <- RenameCells(seu, add.cell.id = mf_names[i])
  seu$Disease_state <- "MF"
  seu$Patient_ID <- gsub("^MF_P", "P", mf_names[i])
  seurat_objects[[mf_names[i]]] <- seu
}

# Merge all Seurat objects
all_seurat <- Reduce(function(x, y) merge(x, y), seurat_objects)

# Basic QC: Filter low-quality cells and high mitochondrial content
all_seurat[["percent.mt"]] <- PercentageFeatureSet(all_seurat, pattern = "^MT-")
all_seurat <- subset(all_seurat, subset = nFeature_RNA > 200 & percent.mt < 10)

# Store main object
ss_Harro <- all_seurat

3. QC


VlnPlot(ss_Harro, features = c("nFeature_RNA", 
                                    "nCount_RNA", 
                                    "percent.mt"), 
                                      ncol = 3)
Warning: Default search for "data" layer in "RNA" assay yielded no results; utilizing "counts" layer instead.Warning: The `slot` argument of `FetchData()` is deprecated as of SeuratObject 5.0.0.
Please use the `layer` argument instead.Warning: `PackageCheck()` was deprecated in SeuratObject 5.0.0.
Please use `rlang::check_installed()` instead.Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`

VlnPlot(ss_Harro, features = c("nFeature_RNA", 
                                         "nCount_RNA", 
                                         "percent.mt",
                                         "percent.rb"), 
                            ncol = 4, pt.size = 0.1) & 
              theme(plot.title = element_text(size=10))
Warning: Default search for "data" layer in "RNA" assay yielded no results; utilizing "counts" layer instead.Warning: The following requested variables were not found: percent.rbRasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`

FeatureScatter(ss_Harro, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA") +
  geom_smooth(method = 'lm')
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`

##FeatureScatter is typically used to visualize feature-feature relationships ##for anything calculated by the object, ##i.e. columns in object metadata, PC scores etc.


FeatureScatter(ss_Harro, 
               feature1 = "nCount_RNA", 
               feature2 = "percent.mt")+
  geom_smooth(method = 'lm')
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`

FeatureScatter(ss_Harro, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA")+
  geom_smooth(method = 'lm')
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`

##Read Seurat object with load as you save it with save() function


All_samples_Merged <- readRDS("../0-Seurat_RDS_OBJECT_FINAL/All_samples_Merged_Harmony_integrated_Cell_line_renamed_03-07-2025.rds")

Save the Seurat object as an RDS



saveRDS(ss_Harro, file = "data/ss_Harro_SS_4_MF_3_QC_object.rds")

4. Log-normalization (scale factor = 10,000) and Integration



library(Seurat)

# Normalize (log-normalization), identify top 5000 variable genes
ss_list <- SplitObject(ss_Harro, split.by = "orig.ident")
ss_list <- lapply(ss_list, function(x) {
  x <- NormalizeData(x, normalization.method = "LogNormalize", scale.factor = 10000)
  x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 5000)
  x <- CellCycleScoring(x, s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes)
  return(x)
})

# Remove TCR/Ig genes from variable features
tcr_genes <- grep("^TR[ABGD]|^IG[HKL]", rownames(ss_list[[1]]), value = TRUE)
ss_list <- lapply(ss_list, function(x) {
  vgs <- VariableFeatures(x)
  vgs <- setdiff(vgs, tcr_genes)
  VariableFeatures(x) <- vgs
  return(x)
})


# Integration using 8000 anchors and CCA (dims = 40)
features <- SelectIntegrationFeatures(object.list = ss_list, nfeatures = 8000)
anchors <- FindIntegrationAnchors(object.list = ss_list, anchor.features = features, reduction = "cca", dims = 1:40)
ss_integrated <- IntegrateData(anchorset = anchors, dims = 1:40)

# Set integrated assay as default
DefaultAssay(ss_integrated) <- "integrated"


# ScaleData with regressions
ss_integrated <- ScaleData(ss_integrated, vars.to.regress = c("nCount_RNA", "percent.mt", "S.Score", "G2M.Score"))

5. PCA + UMAP


ss_Harro <- ss_integrated

# PCA
ss_Harro <- RunPCA(ss_Harro, npcs = 50)




# Optional: Visualize elbow plot
ElbowPlot(ss_Harro, ndims = 50)

# # Run JackStraw analysis properly
# ss_Harro <- JackStraw(ss_Harro, num.replicate = 100)
# ss_Harro <- ScoreJackStraw(ss_Harro, dims = 1:20)
# 
# # Visualize JackStraw scores
# JackStrawPlot(ss_Harro, dims = 1:20)

6. PCA Visualization




# TEST-1
# given that the output of RunPCA is "pca"
# replace "so" by the name of your seurat object

pct <- ss_Harro[["pca"]]@stdev / sum(ss_Harro[["pca"]]@stdev) * 100
cumu <- cumsum(pct) # Calculate cumulative percents for each PC
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which((pct[-length(pct)] - pct[-1]) > 0.1), decreasing = T)[1] + 1
# last point where change of % of variation is more than 0.1%. -> co2
co2

# TEST-2
# get significant PCs
stdv <- ss_Harro[["pca"]]@stdev
sum.stdv <- sum(ss_Harro[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
cumulative <- cumsum(percent.stdv)
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co2 <- sort(which((percent.stdv[1:length(percent.stdv) - 1] - 
                       percent.stdv[2:length(percent.stdv)]) > 0.1), 
              decreasing = T)[1] + 1
min.pc <- min(co1, co2)
min.pc

# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv, 
           cumu = cumulative, 
           rank = 1:length(percent.stdv))

# Elbow plot to visualize 
  ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) + 
  geom_text() + 
  geom_vline(xintercept = 90, color = "grey") + 
  geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
  theme_bw()

  

7. Clustering (resolution = 1.2)

# Then find neighbors & clusters
ss_Harro <- FindNeighbors(ss_Harro, dims = 1:40)
ss_Harro <- FindClusters(ss_Harro, resolution = 1.0)

# run UMAP
ss_Harro <- RunUMAP(ss_Harro, dims = 1:40)

ss_Harro <-RunTSNE(ss_Harro, dims.use = 1:40)

8. Visualize UMAP with Clusters


DimPlot(ss_Harro, reduction = "umap",group.by = "orig.ident", label = TRUE,repel = T, pt.size = 0.6) +
  ggtitle("UMAP of Sézary Syndrome CD4+ T Cells")


DimPlot(ss_Harro, reduction = "umap", label = TRUE,repel = T, pt.size = 0.6) +
  ggtitle("UMAP of Sézary Syndrome CD4+ T Cells")


DimPlot(ss_Harro, reduction = "tsne", label = TRUE, pt.size = 0.6) +
  ggtitle("TSNE of Sézary Syndrome CD4+ T Cells")

9. FeaturePlots for Top50 UP

top_50_up <- read.csv("../Data_ss_Harro_2019/top_50_upregulated.csv")        # or read.delim("top_50_up.tsv")
top_50_down <- read.csv("../Data_ss_Harro_2019/top_50_downregulated.csv")

Idents(ss_Harro) <- "seurat_clusters"


FeaturePlot(ss_Harro, 
             features = top_50_up$gene[1:10], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)


FeaturePlot(ss_Harro, 
             features = top_50_up$gene[11:20], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

FeaturePlot(ss_Harro, 
             features = top_50_up$gene[21:30], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
FeaturePlot(ss_Harro, 
             features = top_50_up$gene[31:40], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
FeaturePlot(ss_Harro, 
             features = top_50_up$gene[41:50], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

10. FeaturePlots for Top50 DOWN


FeaturePlot(ss_Harro, 
             features = top_50_down$gene[1:10], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)


FeaturePlot(ss_Harro, 
             features = top_50_down$gene[11:20], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

FeaturePlot(ss_Harro, 
             features = top_50_down$gene[21:30], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
FeaturePlot(ss_Harro, 
             features = top_50_down$gene[31:40], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
FeaturePlot(ss_Harro, 
             features = top_50_down$gene[41:50], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

Visualization



DimPlot(ss_Harro, group.by = "seurat_clusters", label = T, label.box = T, repel = T, reduction = "umap")

Visualization of Potential biomarkers-Upregulated


DefaultAssay(ss_Harro) <- "RNA"
Idents(ss_Harro) <- "Disease_state"

# Vector of genes to plot
up_genes <- c("CLIC1", "COX5A","GTSF1", "MAD2L1","MYBL2","MYL6B","NME1","PLK1", "PYCR1", "SLC25A5", "SRI", "TUBA1C", "UBE2T", "YWHAH")

# DotPlot with custom firebrick-red gradient
DotPlot(ss_Harro, features = up_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Upregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )

Idents(ss_Harro) <- "orig.ident"

# Vector of genes to plot
up_genes <- c("CLIC1", "COX5A","GTSF1", "MAD2L1","MYBL2","MYL6B","NME1","PLK1", "PYCR1", "SLC25A5", "SRI", "TUBA1C", "UBE2T", "YWHAH")

# DotPlot with custom firebrick-red gradient
DotPlot(ss_Harro, features = up_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Upregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )

Idents(ss_Harro) <- "seurat_clusters"

# Vector of genes to plot
up_genes <- c("CLIC1", "COX5A","GTSF1", "MAD2L1","MYBL2","MYL6B","NME1","PLK1", "PYCR1", "SLC25A5", "SRI", "TUBA1C", "UBE2T", "YWHAH")

# DotPlot with custom firebrick-red gradient
DotPlot(ss_Harro, features = up_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Upregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )

Visualization of Potential biomarkers-Downregulated


Idents(ss_Harro) <- "Disease_state"
# Downregulated genes
down_genes <- c("TXNIP", "RASA3", "RIPOR2", 
                "ZFP36", "ZFP36L1", "ZFP36L2",
                "PRMT2", "MAX", "PIK3IP1", 
                "BTG1", "CDKN1B")

# DotPlot with firebrick color for high expression
DotPlot(ss_Harro, features = down_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Downregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )


Idents(ss_Harro) <- "orig.ident"
# Downregulated genes
down_genes <- c("TXNIP", "RASA3", "RIPOR2", 
                "ZFP36", "ZFP36L1", "ZFP36L2",
                "PRMT2", "MAX", "PIK3IP1", 
                "BTG1", "CDKN1B")

# DotPlot with firebrick color for high expression
DotPlot(ss_Harro, features = down_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Downregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )
Idents(ss_Harro) <- "seurat_clusters"

# DotPlot with firebrick color for high expression
DotPlot(ss_Harro, features = down_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Downregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )

Save the Seurat object as an RDS



saveRDS(ss_Harro, file = "data/ss_Harro_SS_4_MF_3_Integrated_object.rds")
---
title: "Potential biomarkers Validation (Harro_4_SS_and_3_MF_Patient_Samples)"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  #rmdformats::readthedown
  html_notebook:
    toc: true
    toc_float: true
    toc_collapsed: true
---

# 1. load libraries
```{r setup, include=FALSE}

library(Seurat)
library(SeuratObject)
library(SeuratData)
library(patchwork)
library(Azimuth)
library(dplyr)
library(ggplot2)
library(tidyverse)
library(rmarkdown)
library(tinytex)


library(dplyr)
library(dittoSeq)
library(ggrepel)
#library(ggtree)
library(parallel)
library(plotly)  # 3D plot
library(Seurat)  # Idents()
library(SeuratDisk)  # SaveH5Seurat()
library(tibble)  # rownnames_to_column
library(harmony) # RunHarmony()
#options(mc.cores = detectCores() - 1)

library(dplyr)
library(tidyverse)
library(ggplot2)
library(RColorBrewer)
library(magrittr)
library(dbplyr)
library(rmarkdown)
library(knitr)
library(tinytex)
#Azimuth Annotation libraries
library(Azimuth)
#ProjecTils Annotation libraries
library(STACAS)
library(ProjecTILs)
#singleR Annotation libraries

library(SingleCellExperiment)

```


# 2. Load Data into Seurat
```{r load_seurat}

library(Seurat)
library(dplyr)

# Set directory paths
ss_dir <- "data/SS_Patients/"
MF_dir <- "data/MF_Patients/"

ss_dirs <- sort(list.dirs(ss_dir, full.names = TRUE, recursive = FALSE))
mf_dirs <- sort(list.dirs(MF_dir, full.names = TRUE, recursive = FALSE))

ss_names <- paste0("SS_P", 1:4)
mf_names <- paste0("MF_P", 1:3)

# Initialize list for Seurat objects
seurat_objects <- list()

# Load SS samples
for (i in seq_along(ss_dirs)) {
  dat <- Read10X(data.dir = ss_dirs[i])
  seu <- CreateSeuratObject(counts = dat, project = ss_names[i], min.cells = 3, min.features = 200)
  seu$sample <- ss_names[i]
  seu <- RenameCells(seu, add.cell.id = ss_names[i])
  seu$Disease_state <- "Sézary"
  seu$Patient_ID <- gsub("^SS_P", "P", ss_names[i])
  seurat_objects[[ss_names[i]]] <- seu
}

# Load MF samples
for (i in seq_along(mf_dirs)) {
  dat <- Read10X(data.dir = mf_dirs[i])
  seu <- CreateSeuratObject(counts = dat, project = mf_names[i], min.cells = 3, min.features = 200)
  seu$sample <- mf_names[i]
  seu <- RenameCells(seu, add.cell.id = mf_names[i])
  seu$Disease_state <- "MF"
  seu$Patient_ID <- gsub("^MF_P", "P", mf_names[i])
  seurat_objects[[mf_names[i]]] <- seu
}

# Merge all Seurat objects
all_seurat <- Reduce(function(x, y) merge(x, y), seurat_objects)

# Basic QC: Filter low-quality cells and high mitochondrial content
all_seurat[["percent.mt"]] <- PercentageFeatureSet(all_seurat, pattern = "^MT-")
all_seurat <- subset(all_seurat, subset = nFeature_RNA > 200 & percent.mt < 10)

# Store main object
ss_Harro <- all_seurat

```

# 3. QC
```{r QC, fig.height=6, fig.width=10}

VlnPlot(ss_Harro, features = c("nFeature_RNA", 
                                    "nCount_RNA", 
                                    "percent.mt"), 
                                      ncol = 3)

VlnPlot(ss_Harro, features = c("nFeature_RNA", 
                                         "nCount_RNA", 
                                         "percent.mt",
                                         "percent.rb"), 
                            ncol = 4, pt.size = 0.1) & 
              theme(plot.title = element_text(size=10))

FeatureScatter(ss_Harro, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA") +
  geom_smooth(method = 'lm')

```


##FeatureScatter is typically used to visualize feature-feature relationships
##for anything calculated by the object, 
##i.e. columns in object metadata, PC scores etc.

```{r FC, fig.height=6, fig.width=10}

FeatureScatter(ss_Harro, 
               feature1 = "nCount_RNA", 
               feature2 = "percent.mt")+
  geom_smooth(method = 'lm')

FeatureScatter(ss_Harro, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA")+
  geom_smooth(method = 'lm')

```

##Read Seurat object with load as you save it with save() function

```{r loadSeurat}

All_samples_Merged <- readRDS("../0-Seurat_RDS_OBJECT_FINAL/All_samples_Merged_Harmony_integrated_Cell_line_renamed_03-07-2025.rds")

```


## Save the Seurat object as an RDS
```{r saveRDS}


saveRDS(ss_Harro, file = "data/ss_Harro_SS_4_MF_3_QC_object.rds")


```



# 4. Log-normalization (scale factor = 10,000) and Integration
```{r PCA, fig.height=6, fig.width=10}


library(Seurat)

# Normalize (log-normalization), identify top 5000 variable genes
ss_list <- SplitObject(ss_Harro, split.by = "orig.ident")
ss_list <- lapply(ss_list, function(x) {
  x <- NormalizeData(x, normalization.method = "LogNormalize", scale.factor = 10000)
  x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 5000)
  x <- CellCycleScoring(x, s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes)
  return(x)
})

# Remove TCR/Ig genes from variable features
tcr_genes <- grep("^TR[ABGD]|^IG[HKL]", rownames(ss_list[[1]]), value = TRUE)
ss_list <- lapply(ss_list, function(x) {
  vgs <- VariableFeatures(x)
  vgs <- setdiff(vgs, tcr_genes)
  VariableFeatures(x) <- vgs
  return(x)
})


# Integration using 8000 anchors and CCA (dims = 40)
features <- SelectIntegrationFeatures(object.list = ss_list, nfeatures = 8000)
anchors <- FindIntegrationAnchors(object.list = ss_list, anchor.features = features, reduction = "cca", dims = 1:40)
ss_integrated <- IntegrateData(anchorset = anchors, dims = 1:40)

# Set integrated assay as default
DefaultAssay(ss_integrated) <- "integrated"


# ScaleData with regressions
ss_integrated <- ScaleData(ss_integrated, vars.to.regress = c("nCount_RNA", "percent.mt", "S.Score", "G2M.Score"))


```


# 5. PCA + UMAP
```{r , fig.height=6, fig.width=10}

ss_Harro <- ss_integrated

# PCA
ss_Harro <- RunPCA(ss_Harro, npcs = 50)




# Optional: Visualize elbow plot
ElbowPlot(ss_Harro, ndims = 50)

# # Run JackStraw analysis properly
# ss_Harro <- JackStraw(ss_Harro, num.replicate = 100)
# ss_Harro <- ScoreJackStraw(ss_Harro, dims = 1:20)
# 
# # Visualize JackStraw scores
# JackStrawPlot(ss_Harro, dims = 1:20)

```

# 6. PCA Visualization
```{r PCA-TEST2, fig.height=6, fig.width=10}



# TEST-1
# given that the output of RunPCA is "pca"
# replace "so" by the name of your seurat object

pct <- ss_Harro[["pca"]]@stdev / sum(ss_Harro[["pca"]]@stdev) * 100
cumu <- cumsum(pct) # Calculate cumulative percents for each PC
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which((pct[-length(pct)] - pct[-1]) > 0.1), decreasing = T)[1] + 1
# last point where change of % of variation is more than 0.1%. -> co2
co2

# TEST-2
# get significant PCs
stdv <- ss_Harro[["pca"]]@stdev
sum.stdv <- sum(ss_Harro[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
cumulative <- cumsum(percent.stdv)
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co2 <- sort(which((percent.stdv[1:length(percent.stdv) - 1] - 
                       percent.stdv[2:length(percent.stdv)]) > 0.1), 
              decreasing = T)[1] + 1
min.pc <- min(co1, co2)
min.pc

# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv, 
           cumu = cumulative, 
           rank = 1:length(percent.stdv))

# Elbow plot to visualize 
  ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) + 
  geom_text() + 
  geom_vline(xintercept = 90, color = "grey") + 
  geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
  theme_bw()

  

```

# 7. Clustering (resolution = 1.2)
```{r , fig.height=6, fig.width=10}
# Then find neighbors & clusters
ss_Harro <- FindNeighbors(ss_Harro, dims = 1:40)
ss_Harro <- FindClusters(ss_Harro, resolution = 1.0)

# run UMAP
ss_Harro <- RunUMAP(ss_Harro, dims = 1:40)

ss_Harro <-RunTSNE(ss_Harro, dims.use = 1:40)

```


# 8. Visualize UMAP with Clusters
```{r , fig.height=6, fig.width=10}

DimPlot(ss_Harro, reduction = "umap",group.by = "orig.ident", label = TRUE,repel = T, pt.size = 0.6) +
  ggtitle("UMAP of Sézary Syndrome CD4+ T Cells")


DimPlot(ss_Harro, reduction = "umap", label = TRUE,repel = T, pt.size = 0.6) +
  ggtitle("UMAP of Sézary Syndrome CD4+ T Cells")


DimPlot(ss_Harro, reduction = "tsne", label = TRUE, pt.size = 0.6) +
  ggtitle("TSNE of Sézary Syndrome CD4+ T Cells")

```

# 9.  FeaturePlots for Top50 UP
```{r , fig.height=16, fig.width=20}
top_50_up <- read.csv("../Data_ss_Harro_2019/top_50_upregulated.csv")        # or read.delim("top_50_up.tsv")
top_50_down <- read.csv("../Data_ss_Harro_2019/top_50_downregulated.csv")

Idents(ss_Harro) <- "seurat_clusters"


FeaturePlot(ss_Harro, 
             features = top_50_up$gene[1:10], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)


FeaturePlot(ss_Harro, 
             features = top_50_up$gene[11:20], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

FeaturePlot(ss_Harro, 
             features = top_50_up$gene[21:30], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
FeaturePlot(ss_Harro, 
             features = top_50_up$gene[31:40], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
FeaturePlot(ss_Harro, 
             features = top_50_up$gene[41:50], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

```



# 10.  FeaturePlots for Top50 DOWN
```{r , fig.height=16, fig.width=20}

FeaturePlot(ss_Harro, 
             features = top_50_down$gene[1:10], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)


FeaturePlot(ss_Harro, 
             features = top_50_down$gene[11:20], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

FeaturePlot(ss_Harro, 
             features = top_50_down$gene[21:30], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
FeaturePlot(ss_Harro, 
             features = top_50_down$gene[31:40], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
FeaturePlot(ss_Harro, 
             features = top_50_down$gene[41:50], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
```

## Visualization
```{r , fig.height=6, fig.width=10}


DimPlot(ss_Harro, group.by = "seurat_clusters", label = T, label.box = T, repel = T, reduction = "umap")


```

## Visualization of Potential biomarkers-Upregulated
```{r , fig.height=6, fig.width=10}

DefaultAssay(ss_Harro) <- "RNA"
Idents(ss_Harro) <- "Disease_state"

# Vector of genes to plot
up_genes <- c("CLIC1", "COX5A","GTSF1", "MAD2L1","MYBL2","MYL6B","NME1","PLK1", "PYCR1", "SLC25A5", "SRI", "TUBA1C", "UBE2T", "YWHAH")

# DotPlot with custom firebrick-red gradient
DotPlot(ss_Harro, features = up_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Upregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )

Idents(ss_Harro) <- "orig.ident"

# Vector of genes to plot
up_genes <- c("CLIC1", "COX5A","GTSF1", "MAD2L1","MYBL2","MYL6B","NME1","PLK1", "PYCR1", "SLC25A5", "SRI", "TUBA1C", "UBE2T", "YWHAH")

# DotPlot with custom firebrick-red gradient
DotPlot(ss_Harro, features = up_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Upregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )

Idents(ss_Harro) <- "seurat_clusters"

# Vector of genes to plot
up_genes <- c("CLIC1", "COX5A","GTSF1", "MAD2L1","MYBL2","MYL6B","NME1","PLK1", "PYCR1", "SLC25A5", "SRI", "TUBA1C", "UBE2T", "YWHAH")

# DotPlot with custom firebrick-red gradient
DotPlot(ss_Harro, features = up_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Upregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )


```



## Visualization of Potential biomarkers-Downregulated
```{r C2, fig.height=6, fig.width=10}

Idents(ss_Harro) <- "Disease_state"
# Downregulated genes
down_genes <- c("TXNIP", "RASA3", "RIPOR2", 
                "ZFP36", "ZFP36L1", "ZFP36L2",
                "PRMT2", "MAX", "PIK3IP1", 
                "BTG1", "CDKN1B")

# DotPlot with firebrick color for high expression
DotPlot(ss_Harro, features = down_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Downregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )


Idents(ss_Harro) <- "orig.ident"
# Downregulated genes
down_genes <- c("TXNIP", "RASA3", "RIPOR2", 
                "ZFP36", "ZFP36L1", "ZFP36L2",
                "PRMT2", "MAX", "PIK3IP1", 
                "BTG1", "CDKN1B")

# DotPlot with firebrick color for high expression
DotPlot(ss_Harro, features = down_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Downregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )
Idents(ss_Harro) <- "seurat_clusters"

# DotPlot with firebrick color for high expression
DotPlot(ss_Harro, features = down_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Downregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )
```

## Save the Seurat object as an RDS
```{r save}


saveRDS(ss_Harro, file = "data/ss_Harro_SS_4_MF_3_Integrated_object.rds")


```

