1. load libraries
Load your objects
combined_merged <- readRDS("Poglio_Gaydosik_combined_merged_QCfiltered.rds")
2. QC
# Remove the percent.mito column
combined_merged$percent.mito <- NULL
# Set identity classes to an existing column in meta data
Idents(object = combined_merged) <- "batch"
# Add percent ribosomal and mitochondrial content
combined_merged[["percent.rb"]] <- PercentageFeatureSet(combined_merged, pattern = "^RP[SL]")
combined_merged$percent.rb <- replace(as.numeric(combined_merged$percent.rb), is.na(combined_merged$percent.rb), 0)
combined_merged[["percent.mt"]] <- PercentageFeatureSet(combined_merged, pattern = "^MT-")
combined_merged$percent.mt <- replace(as.numeric(combined_merged$percent.mt), is.na(combined_merged$percent.mt), 0)
# ----------------------------
# Filter high `nCount_RNA` cells
# ----------------------------
# Define an upper threshold (e.g., top 0.5% or absolute cutoff)
ncount_thresh <- quantile(combined_merged$nCount_RNA, 0.995) # top 0.5% outliers
message("Removing cells with nCount_RNA > ", round(ncount_thresh))
# Subset the object to remove both types of outliers
combined_merged <- subset(combined_merged,
subset = nCount_RNA < ncount_thresh & percent.mt < 10)
# ----------------------------
# QC Plots (after filtering)
# ----------------------------
VlnPlot(combined_merged, features = c("nFeature_RNA",
"nCount_RNA",
"percent.mt",
"percent.rb"),
ncol = 4, pt.size = 0.1) & theme(plot.title = element_text(size=10))
FeatureScatter(combined_merged, feature1 = "percent.mt", feature2 = "percent.rb")
FeatureScatter(combined_merged, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") +
geom_smooth(method = 'lm')
##FeatureScatter is typically used to visualize feature-feature
relationships ##for anything calculated by the object, ##i.e. columns in
object metadata, PC scores etc.
FeatureScatter(combined_merged,
feature1 = "nCount_RNA",
feature2 = "percent.mt")+
geom_smooth(method = 'lm')
FeatureScatter(combined_merged,
feature1 = "nCount_RNA",
feature2 = "nFeature_RNA")+
geom_smooth(method = 'lm')
JoinLayers
Assign Cell-Cycle Scores
3. Normalize data
# Apply SCTransform
combined_merged <- SCTransform(combined_merged,
vars.to.regress = c("percent.rb","percent.mt","CC.Difference", "nCount_RNA"),
assay = "RNA",
do.scale=TRUE,
do.center=TRUE,
verbose = TRUE)
6. Clustering
# Set the seed for clustering steps
set.seed(123)
combined_merged <- FindNeighbors(combined_merged,
dims = 1:min.pc,
verbose = FALSE)
# understanding resolution
combined_merged <- FindClusters(combined_merged,
resolution = c(0.3, 0.4, 0.5, 0.6, 0.7,0.8, 0.9, 1, 1.2,1.5))
UMAP Visualization
# Set the seed for clustering steps
set.seed(123)
# non-linear dimensionality reduction --------------
combined_merged <- RunUMAP(combined_merged,
dims = 1:min.pc,
verbose = FALSE)
# Define resolution values to plot
resolutions <- c(0.3, 0.4, 0.5, 0.6, 0.7,0.8, 0.9, 1, 1.2,1.5)
# Loop through and generate DimPlots
for (res in resolutions) {
res_col <- paste0("SCT_snn_res.", res)
p <- DimPlot(combined_merged,
group.by = res_col,
reduction = "umap",
label.size = 3,
repel = TRUE,
label = TRUE,
label.box = TRUE) +
ggtitle(paste("Clustering resolution:", res))
print(p)
}
# Set identity classes to an existing column in meta data
Idents(object = combined_merged) <- "SCT_snn_res.0.4"
cluster_table <- table(Idents(combined_merged))
barplot(cluster_table, main = "Number of Cells in Each Cluster",
xlab = "Cluster",
ylab = "Number of Cells",
col = rainbow(length(cluster_table)))
print(cluster_table)
Save the Seurat object as an Robj file
saveRDS(combined_merged, file = "Poglio_Gaydosik_combined_merged_SCT_Normalized.rds")
7. clusTree
library(clustree)
clustree(combined_merged, prefix = "SCT_snn_res.")
8. Azimuth Annotation
InstallData("pbmcref")
# The RunAzimuth function can take a Seurat object as input
combined_merged <- RunAzimuth(combined_merged, reference = "pbmcref")
9. Azimuth Visualization
DimPlot(combined_merged, group.by = "predicted.celltype.l1",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
DimPlot(combined_merged, group.by = "predicted.celltype.l1",
reduction = "umap",
label.size = 3,
repel = T,
label = F)
DimPlot(combined_merged, group.by = "predicted.celltype.l2",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
DimPlot(combined_merged, group.by = "predicted.celltype.l2",
reduction = "umap",
label.size = 3,
repel = T,
label = F)
DimPlot(combined_merged, group.by = "predicted.celltype.l2",
reduction = "umap",
label.size = 3,
repel = T,
label = F)
table(combined_merged$predicted.celltype.l2, combined_merged$SCT_snn_res.0.4)
10. Harmony Integration
# Load required libraries
library(Seurat)
library(harmony)
library(ggplot2)
# Run Harmony, adjusting for batch effect using "cell_line" or another grouping variable
combined_merged <- RunHarmony(
combined_merged,
group.by.vars = c("batch"), # Replace with the metadata column specifying batch or cell line
assay.use="SCT")
# Check results in harmony embeddings
harmony_embeddings <- Embeddings(combined_merged, reduction = "harmony")
head(harmony_embeddings)
# Set the seed for clustering steps
set.seed(123)
# Run UMAP on Harmony embeddings
combined_merged <- RunUMAP(combined_merged, reduction = "harmony", dims = 1:16)
# Set the seed for clustering steps
set.seed(123)
# Optionally, find neighbors and clusters (if you plan to do clustering analysis)
combined_merged <- FindNeighbors(combined_merged, reduction = "harmony", dims = 1:16)
combined_merged <- FindClusters(combined_merged, reduction = "harmony", resolution = 0.5) # Adjust resolution as needed
# Visualize UMAP
DimPlot(combined_merged, reduction = "umap", group.by = "batch", label = TRUE, pt.size = 0.5) +
ggtitle("UMAP of Harmony-Integrated Data")
# Visualize UMAP with batch/cell line information
DimPlot(combined_merged, reduction = "umap", group.by = "batch", label = TRUE, pt.size = 0.5) +
ggtitle("UMAP - Colored by Cell Line (After Harmony Integration)")
# Visualize UMAP with clusters
DimPlot(combined_merged, reduction = "umap", group.by = "seurat_clusters", label = TRUE, pt.size = 0.5) +
ggtitle("UMAP - Clustered Data (After Harmony Integration)")
# # Visualize specific cell types or other metadata
# DimPlot(combined_merged, reduction = "umap", group.by = "predicted.celltype.l2", label = TRUE, pt.size = 0.5) +
# ggtitle("UMAP - Cell Types After Harmony Integration")
Save the Seurat object as an Robj file
saveRDS(combined_merged, file = "Poglio_Gaydosik_combined_merged_Harmonized.rds")
---
title: "Merging Poglio and Gaydosik Data"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  #rmdformats::readthedown
  html_notebook:
    toc: true
    toc_float: true
    toc_collapsed: true

---

# 1. load libraries
```{r , include=FALSE}

# At the top of your script, combine packages like this:
library(tidyverse)      # includes dplyr, ggplot2, tibble, readr, etc.
library(Seurat)
library(harmony)
library(dittoSeq)
library(plotly)
library(SeuratDisk)
library(patchwork)
library(rmarkdown)
library(knitr)
library(tinytex)
library(Azimuth)
library(STACAS)
library(ProjecTILs)
library(SingleCellExperiment)

```


## Load your objects
```{r , fig.height=4, fig.width=6}

combined_merged <- readRDS("Poglio_Gaydosik_combined_merged_QCfiltered.rds")
```

# 2. QC
```{r QC, fig.height=8, fig.width=14}

# Remove the percent.mito column
combined_merged$percent.mito <- NULL


# Set identity classes to an existing column in meta data
Idents(object = combined_merged) <- "batch"

# Add percent ribosomal and mitochondrial content
combined_merged[["percent.rb"]] <- PercentageFeatureSet(combined_merged, pattern = "^RP[SL]")
combined_merged$percent.rb <- replace(as.numeric(combined_merged$percent.rb), is.na(combined_merged$percent.rb), 0)

combined_merged[["percent.mt"]] <- PercentageFeatureSet(combined_merged, pattern = "^MT-")
combined_merged$percent.mt <- replace(as.numeric(combined_merged$percent.mt), is.na(combined_merged$percent.mt), 0)

# ----------------------------
# Filter high `nCount_RNA` cells
# ----------------------------

# Define an upper threshold (e.g., top 0.5% or absolute cutoff)
ncount_thresh <- quantile(combined_merged$nCount_RNA, 0.995)  # top 0.5% outliers
message("Removing cells with nCount_RNA > ", round(ncount_thresh))

# Subset the object to remove both types of outliers
combined_merged <- subset(combined_merged, 
                          subset = nCount_RNA < ncount_thresh & percent.mt < 10)

# ----------------------------
# QC Plots (after filtering)
# ----------------------------

VlnPlot(combined_merged, features = c("nFeature_RNA", 
                                      "nCount_RNA", 
                                      "percent.mt",
                                      "percent.rb"), 
        ncol = 4, pt.size = 0.1) & theme(plot.title = element_text(size=10))

FeatureScatter(combined_merged, feature1 = "percent.mt", feature2 = "percent.rb")

FeatureScatter(combined_merged, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") +
  geom_smooth(method = 'lm')

```

##FeatureScatter is typically used to visualize feature-feature relationships
##for anything calculated by the object, 
##i.e. columns in object metadata, PC scores etc.

```{r , fig.height=6, fig.width=10}

FeatureScatter(combined_merged, 
               feature1 = "nCount_RNA", 
               feature2 = "percent.mt")+
  geom_smooth(method = 'lm')

FeatureScatter(combined_merged, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA")+
  geom_smooth(method = 'lm')

```

##  JoinLayers
```{r , echo=FALSE, fig.height=6, fig.width=10}


DefaultAssay(combined_merged) <- "RNA"

# Merge all counts.* layers into a single counts layer
combined_merged <- JoinLayers(combined_merged, assay = "RNA")

# Confirm layers now
Layers(combined_merged[["RNA"]])
```

##  Assign Cell-Cycle Scores
```{r , echo=FALSE, fig.height=6, fig.width=10}

options(future.globals.maxSize = 8000 * 1024^2)  # Set to 8000 MiB (about 8 GB)


combined_merged <- SCTransform(combined_merged, 
                                   do.scale = FALSE, 
                                   do.center = FALSE)  # Reduce to 1000 variable features


# A list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat.  We can
# segregate this list into markers of G2/M phase and markers of S phase
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes


combined_merged <- CellCycleScoring(combined_merged, 
                                       s.features = s.genes, 
                                       g2m.features = g2m.genes, 
                                       set.ident = TRUE)

DefaultAssay(combined_merged) <- "RNA"
combined_merged$CC.Difference <- combined_merged$S.Score - combined_merged$G2M.Score

```


# 3. Normalize data
```{r SCTNormalize, include=TRUE}
# Apply SCTransform
combined_merged <- SCTransform(combined_merged, 
                                  vars.to.regress = c("percent.rb","percent.mt","CC.Difference", "nCount_RNA"),
                                  assay = "RNA", 
                                  do.scale=TRUE, 
                                  do.center=TRUE, 
                                  verbose = TRUE)
```


# 4. Perform PCA
```{r PCA, fig.height=6, fig.width=10}

Variables_genes <- combined_merged@assays$SCT@var.features

# Exclude genes starting with "HLA-" AND "Xist" AND "TRBV, TRAV"
Variables_genes_after_exclusion <- Variables_genes[!grepl("^HLA-|^XIST|^TRBV|^TRAV", Variables_genes)]

# Set the seed for clustering steps
set.seed(123)

# These are now standard steps in the Seurat workflow for visualization and clustering
combined_merged <- RunPCA(combined_merged,
                        features = Variables_genes_after_exclusion,
                        do.print = TRUE, 
                        pcs.print = 1:5, 
                        genes.print = 15,
                        npcs = 50)

# determine dimensionality of the data
ElbowPlot(combined_merged, ndims = 50)
```

# 5. Perform PCA TEST
```{r PCA-TEST, fig.height=6, fig.width=10}


library(ggplot2)
library(RColorBrewer)  

# Assuming you have 10 different cell lines, generating a color palette with 10 colors
cell_line_colors <- brewer.pal(10, "Set3")

# Assuming combined_merged$cell_line is a factor or character vector containing cell line names
data <- as.data.frame(table(combined_merged$cell_line))
colnames(data) <- c("cell_line", "nUMI")  # Change column name to nUMI

ncells <- ggplot(data, aes(x = cell_line, y = nUMI, fill = cell_line)) + 
  geom_col() +
  theme_classic() +
  geom_text(aes(label = nUMI), 
            position = position_dodge(width = 0.9), 
            vjust = -0.25) +
  scale_fill_manual(values = cell_line_colors) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5)) +  # Adjust the title position
  ggtitle("Filtered cells per sample") +
  xlab("Cell lines") +  # Adjust x-axis label
  ylab("Frequency")    # Adjust y-axis label

print(ncells)



# TEST-1
# given that the output of RunPCA is "pca"
# replace "so" by the name of your seurat object

pct <- combined_merged[["pca"]]@stdev / sum(combined_merged[["pca"]]@stdev) * 100
cumu <- cumsum(pct) # Calculate cumulative percents for each PC
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which((pct[-length(pct)] - pct[-1]) > 0.1), decreasing = T)[1] + 1
# last point where change of % of variation is more than 0.1%. -> co2
co2

# TEST-2
# get significant PCs
stdv <- combined_merged[["pca"]]@stdev
sum.stdv <- sum(combined_merged[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
cumulative <- cumsum(percent.stdv)
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co2 <- sort(which((percent.stdv[1:length(percent.stdv) - 1] - 
                       percent.stdv[2:length(percent.stdv)]) > 0.1), 
              decreasing = T)[1] + 1
min.pc <- min(co1, co2)
min.pc

# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv, 
           cumu = cumulative, 
           rank = 1:length(percent.stdv))

# Elbow plot to visualize 
  ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) + 
  geom_text() + 
  geom_vline(xintercept = 90, color = "grey") + 
  geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
  theme_bw()

  

```

# 6. Clustering
```{r C1, fig.height=6, fig.width=10}

# Set the seed for clustering steps
set.seed(123)

combined_merged <- FindNeighbors(combined_merged, 
                                dims = 1:min.pc, 
                                verbose = FALSE)

# understanding resolution
combined_merged <- FindClusters(combined_merged, 
                                    resolution = c(0.3, 0.4, 0.5, 0.6, 0.7,0.8, 0.9, 1, 1.2,1.5))


```

## UMAP Visualization
```{r C2, fig.height=6, fig.width=10}
# Set the seed for clustering steps
set.seed(123)

# non-linear dimensionality reduction --------------
combined_merged <- RunUMAP(combined_merged, 
                          dims = 1:min.pc,
                          verbose = FALSE)
                                  

# Define resolution values to plot
resolutions <- c(0.3, 0.4, 0.5, 0.6, 0.7,0.8, 0.9, 1, 1.2,1.5)

# Loop through and generate DimPlots
for (res in resolutions) {
  res_col <- paste0("SCT_snn_res.", res)
  
  p <- DimPlot(combined_merged,
               group.by = res_col,
               reduction = "umap",
               label.size = 3,
               repel = TRUE,
               label = TRUE,
               label.box = TRUE) +
       ggtitle(paste("Clustering resolution:", res))
  
  print(p)
}


# Set identity classes to an existing column in meta data
Idents(object = combined_merged) <- "SCT_snn_res.0.4"

cluster_table <- table(Idents(combined_merged))


barplot(cluster_table, main = "Number of Cells in Each Cluster", 
                      xlab = "Cluster", 
                      ylab = "Number of Cells", 
                      col = rainbow(length(cluster_table)))

print(cluster_table)

```


## Save the Seurat object as an Robj file
```{r, echo=TRUE}

saveRDS(combined_merged, file = "Poglio_Gaydosik_combined_merged_SCT_Normalized.rds")

```

# 7. clusTree
```{r clusTree, fig.height=12, fig.width=10}
library(clustree)
clustree(combined_merged, prefix = "SCT_snn_res.")
```

# 8. Azimuth Annotation
```{r azimuth_Annotation2, fig.height=6, fig.width=10}
InstallData("pbmcref")

# The RunAzimuth function can take a Seurat object as input
combined_merged <- RunAzimuth(combined_merged, reference = "pbmcref")

```

# 9. Azimuth Visualization
```{r azimuth_Visualization, fig.height=6, fig.width=10}
DimPlot(combined_merged, group.by = "predicted.celltype.l1", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(combined_merged, group.by = "predicted.celltype.l1", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = F)

DimPlot(combined_merged, group.by = "predicted.celltype.l2", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(combined_merged, group.by = "predicted.celltype.l2", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = F)


DimPlot(combined_merged, group.by = "predicted.celltype.l2", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = F)



table(combined_merged$predicted.celltype.l2, combined_merged$SCT_snn_res.0.4)

```


# 10. Harmony Integration
```{r harmony, fig.height=6, fig.width=10}


# Load required libraries
library(Seurat)
library(harmony)
library(ggplot2)

# Run Harmony, adjusting for batch effect using "cell_line" or another grouping variable
combined_merged <- RunHarmony(
  combined_merged,
  group.by.vars = c("batch"),  # Replace with the metadata column specifying batch or cell line
  assay.use="SCT")

# Check results in harmony embeddings
harmony_embeddings <- Embeddings(combined_merged, reduction = "harmony")
head(harmony_embeddings)

# Set the seed for clustering steps
set.seed(123)

# Run UMAP on Harmony embeddings
combined_merged <- RunUMAP(combined_merged, reduction = "harmony", dims = 1:16)

# Set the seed for clustering steps
set.seed(123)

# Optionally, find neighbors and clusters (if you plan to do clustering analysis)
combined_merged <- FindNeighbors(combined_merged, reduction = "harmony", dims = 1:16)
combined_merged <- FindClusters(combined_merged, reduction = "harmony", resolution = 0.5)  # Adjust resolution as needed

# Visualize UMAP
DimPlot(combined_merged, reduction = "umap", group.by = "batch", label = TRUE, pt.size = 0.5) +
    ggtitle("UMAP of Harmony-Integrated Data")


# Visualize UMAP with batch/cell line information
DimPlot(combined_merged, reduction = "umap", group.by = "batch", label = TRUE, pt.size = 0.5) +
    ggtitle("UMAP - Colored by Cell Line (After Harmony Integration)")


# Visualize UMAP with clusters
DimPlot(combined_merged, reduction = "umap", group.by = "seurat_clusters", label = TRUE, pt.size = 0.5) +
    ggtitle("UMAP - Clustered Data (After Harmony Integration)")

# # Visualize specific cell types or other metadata
# DimPlot(combined_merged, reduction = "umap", group.by = "predicted.celltype.l2", label = TRUE, pt.size = 0.5) +
#     ggtitle("UMAP - Cell Types After Harmony Integration")



```


## Save the Seurat object as an Robj file
```{r, echo=TRUE}

saveRDS(combined_merged, file = "Poglio_Gaydosik_combined_merged_Harmonized.rds")

```




