1. load libraries
2. Load Seurat Object
#Load Seurat Object merged from cell lines and a control(PBMC) after filtration
SS_All_samples_Merged <- load("SS_All_Sample_Merged_Azimuth_ProjectTils_singleR_ANNOTATION_on_My_UMAP0.7_HPC.Robj")
All_samples_Merged
An object of class Seurat
63193 features across 49193 samples within 6 assays
Active assay: SCT (26469 features, 3000 variable features)
3 layers present: counts, data, scale.data
5 other assays present: RNA, ADT, prediction.score.celltype.l1, prediction.score.celltype.l2, prediction.score.celltype.l3
4 dimensional reductions calculated: pca, umap, integrated_dr, ref.umap
3. QC
Idents(All_samples_Merged) <- "cell_line"
VlnPlot(All_samples_Merged, features = c("nFeature_RNA",
"nCount_RNA",
"percent.mito"),
ncol = 3)

FeatureScatter(All_samples_Merged,
feature1 = "nCount_RNA",
feature2 = "nFeature_RNA") +
geom_smooth(method = 'lm')

##FeatureScatter is typically used to visualize feature-feature
relationships ##for anything calculated by the object, ##i.e. columns in
object metadata, PC scores etc.
FeatureScatter(All_samples_Merged,
feature1 = "nCount_RNA",
feature2 = "percent.mito")+
geom_smooth(method = 'lm')

FeatureScatter(All_samples_Merged,
feature1 = "nCount_RNA",
feature2 = "nFeature_RNA")+
geom_smooth(method = 'lm')

4. Normalize data
# # Apply SCTransform
# All_samples_Merged <- SCTransform(All_samples_Merged, verbose = TRUE)
#
5. Perform PCA TEST
library(ggplot2)
library(RColorBrewer)
# Assuming you have 10 different cell lines, generating a color palette with 10 colors
cell_line_colors <- brewer.pal(10, "Set3")
# Assuming All_samples_Merged$cell_line is a factor or character vector containing cell line names
data <- as.data.frame(table(All_samples_Merged$cell_line))
colnames(data) <- c("cell_line", "nUMI") # Change column name to nUMI
ncells <- ggplot(data, aes(x = cell_line, y = nUMI, fill = cell_line)) +
geom_col() +
theme_classic() +
geom_text(aes(label = nUMI),
position = position_dodge(width = 0.9),
vjust = -0.25) +
scale_fill_manual(values = cell_line_colors) +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)) + # Adjust the title position
ggtitle("Filtered cells per sample") +
xlab("Cell lines") + # Adjust x-axis label
ylab("Frequency") # Adjust y-axis label
print(ncells)

cell_lines <- FindVariableFeatures(All_samples_Merged,
selection.method = "vst", # default vst
nfeatures = 2000, # default 2000
verbose = FALSE)
# view top variable genes
top40 <- head(VariableFeatures(cell_lines), 40)
top40
[1] "CCL17" "CXCL8" "CXCL5" "THBS1" "S100A8" "TYROBP" "S100A9" "CCL1" "CXCL1"
[10] "IL1B" "SERPINB2" "PID1" "CCL4" "FTH1" "CXCL3" "MMP9" "SOD2" "CD14"
[19] "C15orf48" "CCL3" "EREG" "IGKV3-20" "FCER1G" "LYZ" "CCL22" "GNLY" "PPBP"
[28] "OASL" "IGKV3-11" "IGLV2-14" "CXCL2" "DOCK4" "IFIT2" "CD79A" "VMO1" "IGKV1-5"
[37] "XCL1" "GZMB" "SPI1" "IGHV4-34"
# plot variable features with labels
VarFeatPlot <- VariableFeaturePlot(cell_lines, cols = c("gray47","red"))
VarFeatPlotLabel <- LabelPoints(plot = VarFeatPlot,
points = top40, repel = TRUE, fontface="italic",
xnudge = 0, ynudge = 0, max.overlaps = 12)
VarFeatPlotLabel

df <- data.frame(row.names = rownames(cell_lines))
df$rsum <- rowSums(x = cell_lines, slot = "counts")
df$gene_name <- rownames(df)
df <- df[order(df$rsum,decreasing = TRUE),]
head(df, 10)
# TEST-1
# given that the output of RunPCA is "pca"
# replace "so" by the name of your seurat object
pct <- All_samples_Merged[["pca"]]@stdev / sum(All_samples_Merged[["pca"]]@stdev) * 100
cumu <- cumsum(pct) # Calculate cumulative percents for each PC
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which((pct[-length(pct)] - pct[-1]) > 0.1), decreasing = T)[1] + 1
# last point where change of % of variation is more than 0.1%. -> co2
co2
[1] 20
# TEST-2
# get significant PCs
stdv <- All_samples_Merged[["pca"]]@stdev
sum.stdv <- sum(All_samples_Merged[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
cumulative <- cumsum(percent.stdv)
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co2 <- sort(which((percent.stdv[1:length(percent.stdv) - 1] -
percent.stdv[2:length(percent.stdv)]) > 0.1),
decreasing = T)[1] + 1
min.pc <- min(co1, co2)
min.pc
[1] 20
# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv,
cumu = cumulative,
rank = 1:length(percent.stdv))
# Elbow plot to visualize
ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) +
geom_text() +
geom_vline(xintercept = 90, color = "grey") +
geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
theme_bw()

NA
NA
6. Clustering
# run umap
#celllines.nointergration <- RunUMAP(All_samples_Merged, dims = 1:min.pc, reduction = "pca")
# cluster
#celllines.nointergration <- FindNeighbors(object = celllines.nointergration, dims = 1:min.pc)
# Determine the clusters for various resolutions
#celllines.nointergration <- FindClusters(object = celllines.nointergration, resolution = seq(0.1,1.2,by=0.1))
Idents(celllines.nointergration) <- "SCT_snn_res.0.7"
celllines.nointergration$seurat_clusters <- celllines.nointergration$SCT_snn_res.0.7
u1 <- DimPlot(object = celllines.nointergration, group.by = "seurat_clusters", label = TRUE, label.box = TRUE)
u1

u2 <- DimPlot(object = celllines.nointergration,
group.by = "seurat_clusters",
reduction = "umap",
label = TRUE,
label.box = TRUE,
split.by = "cell_line")
u2

u3 <- DimPlot(object = celllines.nointergration,
group.by = "seurat_clusters",
reduction = "umap",
label = TRUE,
label.box = TRUE,
split.by = "Patient_origin")
u3

u4 <- DimPlot(object = celllines.nointergration,
group.by = "seurat_clusters",
reduction = "umap",
label = TRUE,
label.box = TRUE,
split.by = "Treatments_analysis")
u4

u5 <- dittoDimPlot(object = celllines.nointergration,
var = "seurat_clusters",
reduction.use = "umap",
do.label = TRUE,
labels.highlight = FALSE)
u5

u6 <- dittoDimPlot(object = celllines.nointergration,
var = "seurat_clusters",
reduction.use = "umap",
do.label = TRUE,
split.by = "Treatments_analysis",
labels.highlight = FALSE)
u6

#Integrate with harmony
# Run harmony to harmonize over samples
# celllines.intergration <- RunHarmony(
# celllines.nointergration,
# group.by.vars = "orig.ident",
# assay.use = "SCT",
# reduction.use = "pca",
# plot_convergence = TRUE)
#Find significant PCs
#First metric
# Determine percent of variation associated with each PC
stdv <- celllines.intergration[["pca"]]@stdev
sum.stdv <- sum(celllines.intergration[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
# Calculate cumulative percents for each PC
cumulative <- cumsum(percent.stdv)
# Determine which PC exhibits cumulative percent greater than 90% and
# and % variation associated with the PC as less than 5
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co1
[1] 41
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which(
(percent.stdv[1:length(percent.stdv) - 1] -
percent.stdv[2:length(percent.stdv)]) > 0.1),
decreasing = T)[1] + 1
# last point where change of % of variation is more than 0.1%.
co2
[1] 20
# Minimum of the two calculation
min.pc <- min(co1, co2)
min.pc
[1] 20
plot_df <- data.frame(pct = percent.stdv,
cumu = cumulative,
rank = 1:length(percent.stdv))
# Elbow plot to visualize
ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) +
geom_text() +
geom_vline(xintercept = 90, color = "grey") +
geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
theme_bw()

# Run UMAP
# celllines.intergration <- RunUMAP(celllines.intergration,
# dims = 1:min.pc,
# reduction.use = "pca",
# n.components = 3) # set to 3 to use with VR
# Determine the K-nearest neighbor graph
#celllines.intergration <- FindNeighbors(object = celllines.intergration,dims = 1:min.pc)
# Determine the clusters for various resolutions
#cell_line.unannotated <- FindClusters(object = celllines.intergration,resolution = seq(0.1,1.2,by=0.1))
# Neurons after re-cluster
# UMAP
DefaultAssay(cell_line.unannotated) <- "SCT"
#neuron.unannotated <- NormalizeData(neuron.unannotated, verbose = FALSE)
cell_line.unannotated$seurat_clusters <- cell_line.unannotated$SCT_snn_res.0.7
u1 <- dittoDimPlot(object = cell_line.unannotated,
var = "seurat_clusters",
reduction.use = "umap",
do.label = TRUE,
labels.highlight = FALSE)
u1

u2 <- dittoDimPlot(object = cell_line.unannotated,
var = "seurat_clusters",
reduction.use = "umap",
do.label = TRUE,
split.by = "Treatments_analysis",
labels.highlight = FALSE)
u2

NA
NA
# #Cluster tree
# cluster_colors <- c("gold","firebrick1","dodgerblue","green",
# "cyan","chocolate4","gray40","purple", "blue")
#
# celllines.nointergration <- BuildClusterTree(object = celllines.nointergration,
# dims = 1:min.pc,
# reorder = FALSE,
# reorder.numeric = FALSE)
#
# tree <- celllines.nointergration@tools$BuildClusterTree
# tree$tip.label <- paste0("Cluster ", tree$tip.label)
#
# p <- ggtree::ggtree(tree, aes(x, y)) +
# scale_y_reverse() +
# ggtree::geom_tree() +
# ggtree::theme_tree() +
# ggtree::geom_tiplab(offset = 1) +
# ggtree::geom_tippoint(color = cluster_colors[1:length(tree$tip.label)], shape = 16, size = 5) +
# coord_cartesian(clip = 'off') +
# theme(plot.margin = unit(c(0,2.5,0,0), 'cm'))
Save the Seurat object as an Robj file———————————————————————-
```
---
title: "PC test qnd harmony integration on SCT"
author: Nasir Mahmood Abbasi
date: "2024-05-12"
output:
  html_notebook: 
    toc: true
    toc_float: true
    toc_collapsed: true
    theme: darkly
---

# 1. load libraries
```{r setup, include=FALSE}

library(Seurat)
library(SeuratObject)
library(SeuratData)
library(patchwork)
library(Azimuth)
library(dplyr)
library(ggplot2)
library(tidyverse)
library(rmarkdown)
library(tinytex)


library(dplyr)
library(dittoSeq)
library(ggrepel)
#library(ggtree)
library(parallel)
library(plotly)  # 3D plot
library(Seurat)  # Idents()
library(SeuratDisk)  # SaveH5Seurat()
library(tibble)  # rownnames_to_column
library(harmony) # RunHarmony()
#options(mc.cores = detectCores() - 1)



```


# 2. Load Seurat Object 
```{r load_seurat}

#Load Seurat Object merged from cell lines and a control(PBMC) after filtration
SS_All_samples_Merged <- load("SS_All_Sample_Merged_Azimuth_ProjectTils_singleR_ANNOTATION_on_My_UMAP0.7_HPC.Robj")

All_samples_Merged
```


# 3. QC
```{r QC, fig.height=6, fig.width=10}

Idents(All_samples_Merged) <- "cell_line"
VlnPlot(All_samples_Merged, features = c("nFeature_RNA", 
                                    "nCount_RNA", 
                                    "percent.mito"), 
                                      ncol = 3)

FeatureScatter(All_samples_Merged, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA") +
  geom_smooth(method = 'lm')

```

##FeatureScatter is typically used to visualize feature-feature relationships
##for anything calculated by the object, 
##i.e. columns in object metadata, PC scores etc.

```{r FC, fig.height=6, fig.width=10}

FeatureScatter(All_samples_Merged, 
               feature1 = "nCount_RNA", 
               feature2 = "percent.mito")+
  geom_smooth(method = 'lm')

FeatureScatter(All_samples_Merged, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA")+
  geom_smooth(method = 'lm')

```



# 4. Normalize data
```{r Normalize, fig.height=4, fig.width=6}


# # Apply SCTransform
# All_samples_Merged <- SCTransform(All_samples_Merged, verbose = TRUE)
#                                       
```


# 5. Perform PCA
```{r PCA, fig.height=6, fig.width=10}

# Variables_genes <- All_samples_Merged@assays$SCT@var.features
# 
# # Exclude genes starting with "HLA-" or "Xist"
# Variables_genes_after_exclusion <- Variables_genes[!grepl("^HLA-|^Xist", Variables_genes)]
# 
# 
# # These are now standard steps in the Seurat workflow for visualization and clustering
# All_samples_Merged <- RunPCA(All_samples_Merged,
#                         features = Variables_genes_after_exclusion,
#                         do.print = TRUE, 
#                         pcs.print = 1:5, 
#                         genes.print = 15)

# determine dimensionality of the data
ElbowPlot(All_samples_Merged)


```


# 5. Perform PCA TEST
```{r PCA-TEST, fig.height=6, fig.width=10}


library(ggplot2)
library(RColorBrewer)  

# Assuming you have 10 different cell lines, generating a color palette with 10 colors
cell_line_colors <- brewer.pal(10, "Set3")

# Assuming All_samples_Merged$cell_line is a factor or character vector containing cell line names
data <- as.data.frame(table(All_samples_Merged$cell_line))
colnames(data) <- c("cell_line", "nUMI")  # Change column name to nUMI

ncells <- ggplot(data, aes(x = cell_line, y = nUMI, fill = cell_line)) + 
  geom_col() +
  theme_classic() +
  geom_text(aes(label = nUMI), 
            position = position_dodge(width = 0.9), 
            vjust = -0.25) +
  scale_fill_manual(values = cell_line_colors) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5)) +  # Adjust the title position
  ggtitle("Filtered cells per sample") +
  xlab("Cell lines") +  # Adjust x-axis label
  ylab("Frequency")    # Adjust y-axis label

print(ncells)


cell_lines <- FindVariableFeatures(All_samples_Merged,
                                   selection.method = "vst",  # default vst
                                   nfeatures = 2000,  # default 2000
                                   verbose = FALSE)

# view top variable genes
top40 <- head(VariableFeatures(cell_lines), 40)
top40



# plot variable features with labels
VarFeatPlot <- VariableFeaturePlot(cell_lines, cols = c("gray47","red"))
VarFeatPlotLabel <- LabelPoints(plot = VarFeatPlot, 
                    points = top40, repel = TRUE, fontface="italic", 
                    xnudge = 0, ynudge = 0, max.overlaps = 12)
VarFeatPlotLabel


df <- data.frame(row.names = rownames(cell_lines))
df$rsum <- rowSums(x = cell_lines, slot = "counts")
df$gene_name <- rownames(df)
df <- df[order(df$rsum,decreasing = TRUE),]
head(df, 10)

# TEST-1
# given that the output of RunPCA is "pca"
# replace "so" by the name of your seurat object

pct <- All_samples_Merged[["pca"]]@stdev / sum(All_samples_Merged[["pca"]]@stdev) * 100
cumu <- cumsum(pct) # Calculate cumulative percents for each PC
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which((pct[-length(pct)] - pct[-1]) > 0.1), decreasing = T)[1] + 1
# last point where change of % of variation is more than 0.1%. -> co2
co2

# TEST-2
# get significant PCs
stdv <- All_samples_Merged[["pca"]]@stdev
sum.stdv <- sum(All_samples_Merged[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
cumulative <- cumsum(percent.stdv)
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co2 <- sort(which((percent.stdv[1:length(percent.stdv) - 1] - 
                       percent.stdv[2:length(percent.stdv)]) > 0.1), 
              decreasing = T)[1] + 1
min.pc <- min(co1, co2)
min.pc

# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv, 
           cumu = cumulative, 
           rank = 1:length(percent.stdv))

# Elbow plot to visualize 
  ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) + 
  geom_text() + 
  geom_vline(xintercept = 90, color = "grey") + 
  geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
  theme_bw()


```


# 6. Clustering
```{r C1, fig.height=6, fig.width=10}

# run umap
#celllines.nointergration <- RunUMAP(All_samples_Merged, dims = 1:min.pc, reduction = "pca")


# cluster
#celllines.nointergration <- FindNeighbors(object = celllines.nointergration, dims = 1:min.pc)   



# Determine the clusters for various resolutions                                
#celllines.nointergration <- FindClusters(object = celllines.nointergration, resolution = seq(0.1,1.2,by=0.1))



Idents(celllines.nointergration) <- "SCT_snn_res.0.7"
celllines.nointergration$seurat_clusters <- celllines.nointergration$SCT_snn_res.0.7



u1 <- DimPlot(object = celllines.nointergration, group.by = "seurat_clusters", label = TRUE, label.box = TRUE)

u1


u2 <- DimPlot(object = celllines.nointergration, 
         group.by = "seurat_clusters", 
         reduction = "umap", 
         label = TRUE, 
         label.box = TRUE, 
         split.by = "cell_line")

u2


u3 <- DimPlot(object = celllines.nointergration, 
         group.by = "seurat_clusters", 
         reduction = "umap", 
         label = TRUE, 
         label.box = TRUE, 
         split.by = "Patient_origin")

u3


u4 <- DimPlot(object = celllines.nointergration, 
         group.by = "seurat_clusters", 
         reduction = "umap", 
         label = TRUE, 
         label.box = TRUE, 
         split.by = "Treatments_analysis")

u4


u5 <- dittoDimPlot(object = celllines.nointergration,
             var = "seurat_clusters",
             reduction.use = "umap",
             do.label = TRUE,
             labels.highlight = FALSE)
u5



u6 <- dittoDimPlot(object = celllines.nointergration,
             var = "seurat_clusters",
             reduction.use = "umap",
             do.label = TRUE,
             split.by = "Treatments_analysis",
             labels.highlight = FALSE)
u6





#Integrate with harmony
# Run harmony to harmonize over samples 
# celllines.intergration <- RunHarmony(
#                               celllines.nointergration, 
#                               group.by.vars = "orig.ident", 
#                               assay.use = "SCT",
#                               reduction.use = "pca", 
#                               plot_convergence = TRUE)






#Find significant PCs
#First metric

# Determine percent of variation associated with each PC
stdv <- celllines.intergration[["pca"]]@stdev
sum.stdv <- sum(celllines.intergration[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100

# Calculate cumulative percents for each PC
cumulative <- cumsum(percent.stdv)

# Determine which PC exhibits cumulative percent greater than 90% and
# and % variation associated with the PC as less than 5
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co1


# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which(
  (percent.stdv[1:length(percent.stdv) - 1] - 
     percent.stdv[2:length(percent.stdv)]) > 0.1), 
  decreasing = T)[1] + 1

# last point where change of % of variation is more than 0.1%.
co2

# Minimum of the two calculation
min.pc <- min(co1, co2)
min.pc


 
plot_df <- data.frame(pct = percent.stdv, 
           cumu = cumulative, 
           rank = 1:length(percent.stdv))

# Elbow plot to visualize 
  ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) + 
  geom_text() + 
  geom_vline(xintercept = 90, color = "grey") + 
  geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
  theme_bw()

# Run UMAP
# celllines.intergration <- RunUMAP(celllines.intergration,
#                            dims = 1:min.pc,
#                            reduction.use = "pca",
#                            n.components = 3) # set to 3 to use with VR


# Determine the K-nearest neighbor graph
#celllines.intergration <- FindNeighbors(object = celllines.intergration,dims = 1:min.pc)


# Determine the clusters for various resolutions                                
#cell_line.unannotated <- FindClusters(object = celllines.intergration,resolution = seq(0.1,1.2,by=0.1))



# Neurons after re-cluster
# UMAP
DefaultAssay(cell_line.unannotated) <- "SCT"
#neuron.unannotated <- NormalizeData(neuron.unannotated, verbose = FALSE)
cell_line.unannotated$seurat_clusters <- cell_line.unannotated$SCT_snn_res.0.7

u1 <- dittoDimPlot(object = cell_line.unannotated,
             var = "seurat_clusters",
             reduction.use = "umap",
             do.label = TRUE,
             labels.highlight = FALSE)
u1

u2 <- dittoDimPlot(object = cell_line.unannotated,
             var = "seurat_clusters",
             reduction.use = "umap",
             do.label = TRUE,
             split.by = "Treatments_analysis",
             labels.highlight = FALSE)
u2


```


```{r C2, fig.height=4, fig.width=6}

# #Cluster tree
# cluster_colors <- c("gold","firebrick1","dodgerblue","green",
#                     "cyan","chocolate4","gray40","purple", "blue")
# 
# celllines.nointergration <- BuildClusterTree(object = celllines.nointergration,
#                                      dims = 1:min.pc,
#                                      reorder = FALSE,
#                                      reorder.numeric = FALSE)
# 
# tree <- celllines.nointergration@tools$BuildClusterTree
# tree$tip.label <- paste0("Cluster ", tree$tip.label)
# 
# p <- ggtree::ggtree(tree, aes(x, y)) +
#   scale_y_reverse() +
#   ggtree::geom_tree() +
#   ggtree::theme_tree() +
#   ggtree::geom_tiplab(offset = 1) +
#   ggtree::geom_tippoint(color = cluster_colors[1:length(tree$tip.label)], shape = 16, size = 5) +
#   coord_cartesian(clip = 'off') +
#   theme(plot.margin = unit(c(0,2.5,0,0), 'cm'))

```


# Save the Seurat object as an Robj file----------------------------------------------------------------------
```{r saveRDS}


# save
#save(cell_line.unannotated,file = "15-cell_line.unannotated_1.Robj")

```

```



