1. load libraries

2. Load Data into Seurat


# Patient IDs and H5 files (replace with your actual file paths)

# Define paths to your H5 files
h5_files <- list(
  "Patient1" = "data/GSM5687767_SZ22raw_feature_bc_matrix.h5",
  "Patient2" = "data/GSM5687769_SZ16raw_feature_bc_matrix.h5",
  "Patient3" = "data/GSM5687772_SZ30raw_feature_bc_matrix.h5"
)

# Read and create individual Seurat objects
seurat_list <- list()

3. Load each patient sample with metadata


seurat_list <- list()

for (patient in names(h5_files)) {
  raw_data <- Read10X_h5(h5_files[[patient]])

  # If raw_data is a list (multi-modal), extract "Gene Expression"
  if (is.list(raw_data)) {
    if (!"Gene Expression" %in% names(raw_data)) {
      stop(paste("Gene Expression modality not found in patient", patient))
    }
    raw_counts <- raw_data[["Gene Expression"]]
  } else {
    # Single modality
    raw_counts <- raw_data
  }

  seu <- CreateSeuratObject(
    counts = raw_counts,
    min.cells = 3,
    min.features = 200,
    project = patient
  )

  seu$patient_id <- patient
  seurat_list[[patient]] <- seu
}
Genome matrix has multiple modalities, returning a list of matrices for this genome
Genome matrix has multiple modalities, returning a list of matrices for this genome

4. Merge All Patients into One Seurat Object

# Step 8: JackStraw (optional but useful for PC significance)
combined_seu <- JackStraw(combined_seu, num.replicate = 100)
Warning: The `slot` argument of `GetAssayData()` is deprecated as of SeuratObject 5.0.0.
Please use the `layer` argument instead.

  |                                                  | 0 % ~calculating  
  |+                                                 | 1 % ~11m 24s      
  |+                                                 | 2 % ~11m 48s      
  |++                                                | 3 % ~11m 26s      
  |++                                                | 4 % ~11m 16s      
  |+++                                               | 5 % ~11m 06s      
  |+++                                               | 6 % ~11m 08s      
  |++++                                              | 7 % ~10m 58s      
  |++++                                              | 8 % ~10m 49s      
  |+++++                                             | 9 % ~10m 40s      

5. QC


# Calculate percent.mt for each cell
combined_seu[["percent.mt"]] <- PercentageFeatureSet(combined_seu, pattern = "^MT-")

# Filter cells with too few or too many genes
combined_seu <- subset(combined_seu, 
                       subset = nFeature_RNA > 200 & 
                                nFeature_RNA < 8000 & 
                                percent.mt < 10)


VlnPlot(combined_seu, features = c("nFeature_RNA", 
                                    "nCount_RNA", 
                                    "percent.mt"), 
                                      ncol = 3)


VlnPlot(combined_seu, features = c("nFeature_RNA", 
                                         "nCount_RNA", 
                                         "percent.mt",
                                         "percent.rb"), 
                            ncol = 4, pt.size = 0.1) & 
              theme(plot.title = element_text(size=10))
Warning: The following requested variables were not found: percent.rb

FeatureScatter(combined_seu, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA") +
  geom_smooth(method = 'lm')

##FeatureScatter is typically used to visualize feature-feature relationships ##for anything calculated by the object, ##i.e. columns in object metadata, PC scores etc.


FeatureScatter(combined_seu, 
               feature1 = "nCount_RNA", 
               feature2 = "percent.mt")+
  geom_smooth(method = 'lm')


FeatureScatter(combined_seu, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA")+
  geom_smooth(method = 'lm')

6. PCA Visualization




# TEST-1
# given that the output of RunPCA is "pca"
# replace "so" by the name of your seurat object

pct <- combined_seu[["pca"]]@stdev / sum(combined_seu[["pca"]]@stdev) * 100
cumu <- cumsum(pct) # Calculate cumulative percents for each PC
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which((pct[-length(pct)] - pct[-1]) > 0.1), decreasing = T)[1] + 1
# last point where change of % of variation is more than 0.1%. -> co2
co2
[1] 15
# TEST-2
# get significant PCs
stdv <- combined_seu[["pca"]]@stdev
sum.stdv <- sum(combined_seu[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
cumulative <- cumsum(percent.stdv)
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co2 <- sort(which((percent.stdv[1:length(percent.stdv) - 1] - 
                       percent.stdv[2:length(percent.stdv)]) > 0.1), 
              decreasing = T)[1] + 1
min.pc <- min(co1, co2)
min.pc
[1] 15
# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv, 
           cumu = cumulative, 
           rank = 1:length(percent.stdv))

# Elbow plot to visualize 
  ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) + 
  geom_text() + 
  geom_vline(xintercept = 90, color = "grey") + 
  geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
  theme_bw()

NA
NA
NA

8. Clustering (resolution = 1.2)



combined_seu <- FindNeighbors(combined_seu, dims = 1:15)
Computing nearest neighbor graph
Computing SNN
combined_seu <- FindClusters(combined_seu, resolution = 0.3)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 29558
Number of edges: 1047845

Running Louvain algorithm...
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9523
Number of communities: 21
Elapsed time: 3 seconds
# Run UMAP
combined_seu <- RunUMAP(combined_seu, dims = 1:15)
15:04:53 UMAP embedding parameters a = 0.9922 b = 1.112
15:04:53 Read 29558 rows and found 15 numeric columns
15:04:53 Using Annoy for neighbor search, n_neighbors = 30
15:04:53 Building Annoy index with metric = cosine, n_trees = 50
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:04:55 Writing NN index file to temp file /tmp/Rtmp4VyJCk/file908877941321
15:04:55 Searching Annoy index using 1 thread, search_k = 3000
15:05:02 Annoy recall = 100%
15:05:03 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
15:05:04 Initializing from normalized Laplacian + noise (using RSpectra)
15:05:06 Commencing optimization for 200 epochs, with 1265872 positive edges
15:05:06 Using rng type: pcg
Using method 'umap'
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:05:15 Optimization finished
# UMAP plot colored by clusters
DimPlot(combined_seu, reduction = "umap", label = TRUE, pt.size = 0.5) +
  ggtitle("UMAP of CD4+ T Cells (Resolution = 1.2)")

9. Visualize UMAP with Clusters


DimPlot(combined_seu, reduction = "umap", group.by = "patient_id", label = TRUE, pt.size = 0.6) +
  ggtitle("UMAP of Sézary Syndrome CD4+ T Cells")


DimPlot(combined_seu, reduction = "umap", label = TRUE,label.box = T,repel = T, pt.size = 0.6) +
  ggtitle("UMAP of Sézary Syndrome CD4+ T Cells")

NA
NA
NA

10. FeaturePlots for Top50 UP

top_50_up <- read.csv("top_50_upregulated.csv")        # or read.delim("top_50_up.tsv")
top_50_down <- read.csv("top_50_downregulated.csv")



FeaturePlot(combined_seu, 
             features = top_50_up$gene[1:10], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)



FeaturePlot(combined_seu, 
             features = top_50_up$gene[11:20], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)


FeaturePlot(combined_seu, 
             features = top_50_up$gene[21:30], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

FeaturePlot(combined_seu, 
             features = top_50_up$gene[31:40], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

FeaturePlot(combined_seu, 
             features = top_50_up$gene[41:50], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

11. FeaturePlots for Top50 DOWN


FeaturePlot(combined_seu, 
             features = top_50_down$gene[1:10], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)



FeaturePlot(combined_seu, 
             features = top_50_down$gene[11:20], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)


FeaturePlot(combined_seu, 
             features = top_50_down$gene[21:30], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

FeaturePlot(combined_seu, 
             features = top_50_down$gene[31:40], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

FeaturePlot(combined_seu, 
             features = top_50_down$gene[41:50], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

Visualization



DimPlot(combined_seu, group.by = "seurat_clusters", label = T, label.box = T, repel = T, reduction = "umap")

NA
NA

Visualization of Potential biomarkers-Upregulated



# Vector of genes to plot
up_genes <- c("CLIC1", "COX5A","GTSF1", "MAD2L1","MYBL2","MYL6B","NME1","PLK1", "PYCR1", "SLC25A5", "SRI", "TUBA1C", "UBE2T", "YWHAH")

# DotPlot with custom firebrick-red gradient
DotPlot(combined_seu, features = up_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Upregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.

Visualization of Potential biomarkers-Downregulated



# Downregulated genes
down_genes <- c("TXNIP", "RASA3", "RIPOR2", 
                "ZFP36", "ZFP36L1", "ZFP36L2",
                "PRMT2", "MAX", "PIK3IP1", 
                "BTG1", "CDKN1B")

# DotPlot with firebrick color for high expression
DotPlot(combined_seu, features = down_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Downregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.

Save the Seurat object as an RDS



saveRDS(combined_seu, file = "data/ss_Gaydosik.rds")
---
title: "Potential biomarkers Validation on Public Patient data_Gaydosik_3_Malignant_Patient_Sample"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  #rmdformats::readthedown
  html_notebook:
    toc: true
    toc_float: true
    toc_collapsed: true
---

# 1. load libraries
```{r , include=FALSE}

library(Seurat)
library(SeuratObject)
library(SeuratData)
library(patchwork)
library(Azimuth)
library(dplyr)
library(ggplot2)
library(tidyverse)
library(rmarkdown)
library(tinytex)


library(dplyr)
library(dittoSeq)
library(ggrepel)
#library(ggtree)
library(parallel)
library(plotly)  # 3D plot
library(Seurat)  # Idents()
library(SeuratDisk)  # SaveH5Seurat()
library(tibble)  # rownnames_to_column
library(harmony) # RunHarmony()
#options(mc.cores = detectCores() - 1)

library(dplyr)
library(tidyverse)
library(ggplot2)
library(RColorBrewer)
library(magrittr)
library(dbplyr)
library(rmarkdown)
library(knitr)
library(tinytex)
#Azimuth Annotation libraries
library(Azimuth)
#ProjecTils Annotation libraries
library(STACAS)
library(ProjecTILs)
#singleR Annotation libraries

library(SingleCellExperiment)

```


# 2. Load Data into Seurat
```{r }

# Patient IDs and H5 files (replace with your actual file paths)

# Define paths to your H5 files
h5_files <- list(
  "Patient1" = "data/GSM5687767_SZ22raw_feature_bc_matrix.h5",
  "Patient2" = "data/GSM5687769_SZ16raw_feature_bc_matrix.h5",
  "Patient3" = "data/GSM5687772_SZ30raw_feature_bc_matrix.h5"
)

# Read and create individual Seurat objects
seurat_list <- list()

```

# 3. Load each patient sample with metadata
```{r , fig.height=6, fig.width=10}

seurat_list <- list()

for (patient in names(h5_files)) {
  raw_data <- Read10X_h5(h5_files[[patient]])

  # If raw_data is a list (multi-modal), extract "Gene Expression"
  if (is.list(raw_data)) {
    if (!"Gene Expression" %in% names(raw_data)) {
      stop(paste("Gene Expression modality not found in patient", patient))
    }
    raw_counts <- raw_data[["Gene Expression"]]
  } else {
    # Single modality
    raw_counts <- raw_data
  }

  seu <- CreateSeuratObject(
    counts = raw_counts,
    min.cells = 3,
    min.features = 200,
    project = patient
  )

  seu$patient_id <- patient
  seurat_list[[patient]] <- seu
}



```

# 4. Merge All Patients into One Seurat Object
```{r }

# Step 2: Merge into combined Seurat object
combined_seu <- merge(
  seurat_list[[1]],
  y = seurat_list[-1],
  add.cell.ids = names(seurat_list),
  project = "SS_Merged"
)

# Step 3: Normalization (LogNormalize)
combined_seu <- NormalizeData(combined_seu, normalization.method = "LogNormalize", scale.factor = 10000)

# Step 4: Variable features selection
combined_seu <- FindVariableFeatures(combined_seu, selection.method = "vst", nfeatures = 2000)

# Step 5: Scaling
combined_seu <- ScaleData(combined_seu)

# Step 6: PCA
combined_seu <- RunPCA(combined_seu, features = VariableFeatures(combined_seu))

# Step 7: Elbow Plot to decide number of PCs
ElbowPlot(combined_seu)


# # Step 8: JackStraw (optional but useful for PC significance)
# combined_seu <- JackStraw(combined_seu, num.replicate = 100)
# combined_seu <- ScoreJackStraw(combined_seu, dims = 1:20)
# JackStrawPlot(combined_seu, dims = 1:20)


```



# 5. QC
```{r , fig.height=6, fig.width=10}

# Calculate percent.mt for each cell
combined_seu[["percent.mt"]] <- PercentageFeatureSet(combined_seu, pattern = "^MT-")

# Filter cells with too few or too many genes
combined_seu <- subset(combined_seu, 
                       subset = nFeature_RNA > 200 & 
                                nFeature_RNA < 8000 & 
                                percent.mt < 10)


VlnPlot(combined_seu, features = c("nFeature_RNA", 
                                    "nCount_RNA", 
                                    "percent.mt"), 
                                      ncol = 3)

VlnPlot(combined_seu, features = c("nFeature_RNA", 
                                         "nCount_RNA", 
                                         "percent.mt",
                                         "percent.rb"), 
                            ncol = 4, pt.size = 0.1) & 
              theme(plot.title = element_text(size=10))

FeatureScatter(combined_seu, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA") +
  geom_smooth(method = 'lm')

```

##FeatureScatter is typically used to visualize feature-feature relationships
##for anything calculated by the object, 
##i.e. columns in object metadata, PC scores etc.

```{r , fig.height=6, fig.width=10}

FeatureScatter(combined_seu, 
               feature1 = "nCount_RNA", 
               feature2 = "percent.mt")+
  geom_smooth(method = 'lm')

FeatureScatter(combined_seu, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA")+
  geom_smooth(method = 'lm')

```

# 6. PCA Visualization
```{r PCA-TEST2, fig.height=6, fig.width=10}



# TEST-1
# given that the output of RunPCA is "pca"
# replace "so" by the name of your seurat object

pct <- combined_seu[["pca"]]@stdev / sum(combined_seu[["pca"]]@stdev) * 100
cumu <- cumsum(pct) # Calculate cumulative percents for each PC
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which((pct[-length(pct)] - pct[-1]) > 0.1), decreasing = T)[1] + 1
# last point where change of % of variation is more than 0.1%. -> co2
co2

# TEST-2
# get significant PCs
stdv <- combined_seu[["pca"]]@stdev
sum.stdv <- sum(combined_seu[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
cumulative <- cumsum(percent.stdv)
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co2 <- sort(which((percent.stdv[1:length(percent.stdv) - 1] - 
                       percent.stdv[2:length(percent.stdv)]) > 0.1), 
              decreasing = T)[1] + 1
min.pc <- min(co1, co2)
min.pc

# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv, 
           cumu = cumulative, 
           rank = 1:length(percent.stdv))

# Elbow plot to visualize 
  ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) + 
  geom_text() + 
  geom_vline(xintercept = 90, color = "grey") + 
  geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
  theme_bw()

  

```

# 8. Clustering (resolution = 1.2)
```{r , fig.height=6, fig.width=10}


combined_seu <- FindNeighbors(combined_seu, dims = 1:15)
combined_seu <- FindClusters(combined_seu, resolution = 0.3)

# Run UMAP
combined_seu <- RunUMAP(combined_seu, dims = 1:15)

# UMAP plot colored by clusters
DimPlot(combined_seu, reduction = "umap", label = TRUE, pt.size = 0.5) +
  ggtitle("UMAP of CD4+ T Cells (Resolution = 1.2)")

```
# 9. Visualize UMAP with Clusters
```{r , fig.height=6, fig.width=10}

DimPlot(combined_seu, reduction = "umap", group.by = "patient_id", label = TRUE, pt.size = 0.6) +
  ggtitle("UMAP of Sézary Syndrome CD4+ T Cells")

DimPlot(combined_seu, reduction = "umap", label = TRUE,label.box = T,repel = T, pt.size = 0.6) +
  ggtitle("UMAP of Sézary Syndrome CD4+ T Cells")



```

# 10.  FeaturePlots for Top50 UP
```{r , fig.height=16, fig.width=20}
top_50_up <- read.csv("top_50_upregulated.csv")        # or read.delim("top_50_up.tsv")
top_50_down <- read.csv("top_50_downregulated.csv")



FeaturePlot(combined_seu, 
             features = top_50_up$gene[1:10], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)


FeaturePlot(combined_seu, 
             features = top_50_up$gene[11:20], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

FeaturePlot(combined_seu, 
             features = top_50_up$gene[21:30], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
FeaturePlot(combined_seu, 
             features = top_50_up$gene[31:40], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
FeaturePlot(combined_seu, 
             features = top_50_up$gene[41:50], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

```



# 11.  FeaturePlots for Top50 DOWN
```{r , fig.height=16, fig.width=20}

FeaturePlot(combined_seu, 
             features = top_50_down$gene[1:10], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)


FeaturePlot(combined_seu, 
             features = top_50_down$gene[11:20], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)

FeaturePlot(combined_seu, 
             features = top_50_down$gene[21:30], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
FeaturePlot(combined_seu, 
             features = top_50_down$gene[31:40], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
FeaturePlot(combined_seu, 
             features = top_50_down$gene[41:50], 
             reduction = "umap", 
             cols = c("lightblue", "red"),  # Custom color gradient from light blue to red
             label = TRUE)
```

## Visualization
```{r , fig.height=6, fig.width=10}


DimPlot(combined_seu, group.by = "seurat_clusters", label = T, label.box = T, repel = T, reduction = "umap")


```

## Visualization of Potential biomarkers-Upregulated
```{r , fig.height=6, fig.width=10}


# Vector of genes to plot
up_genes <- c("CLIC1", "COX5A","GTSF1", "MAD2L1","MYBL2","MYL6B","NME1","PLK1", "PYCR1", "SLC25A5", "SRI", "TUBA1C", "UBE2T", "YWHAH")

# DotPlot with custom firebrick-red gradient
DotPlot(combined_seu, features = up_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Upregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )



```



## Visualization of Potential biomarkers-Downregulated
```{r C2, fig.height=6, fig.width=10}


# Downregulated genes
down_genes <- c("TXNIP", "RASA3", "RIPOR2", 
                "ZFP36", "ZFP36L1", "ZFP36L2",
                "PRMT2", "MAX", "PIK3IP1", 
                "BTG1", "CDKN1B")

# DotPlot with firebrick color for high expression
DotPlot(combined_seu, features = down_genes) +
  RotatedAxis() +
  scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) +
  ggtitle("Expression of Downregulated Genes in Sézary Syndrome") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )



```

# Save the Seurat object as an RDS
```{r saveRDS}


saveRDS(combined_seu, file = "data/ss_Gaydosik.rds")


```



