1. load libraries

Loading required package: SeuratObject
Loading required package: sp

Attaching package: 'SeuratObject'
The following objects are masked from 'package:base':

    intersect, t
── Installed datasets ──────────────────────────────── SeuratData v0.2.2.9001 ──
✔ pbmcref 1.0.0                         ✔ pbmcsca 3.0.0
────────────────────────────────────── Key ─────────────────────────────────────
✔ Dataset loaded successfully
❯ Dataset built with a newer version of Seurat than installed
❓ Unknown version of Seurat installed

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ readr     2.1.5
✔ ggplot2   3.5.1     ✔ stringr   1.5.1
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Attaching package: 'magrittr'


The following object is masked from 'package:purrr':

    set_names


The following object is masked from 'package:tidyr':

    extract



Attaching package: 'dbplyr'


The following objects are masked from 'package:dplyr':

    ident, sql


Registered S3 method overwritten by 'SeuratDisk':
  method            from  
  as.sparse.H5Group Seurat



Attaching shinyBS

Loading required package: ggraph


Attaching package: 'ggraph'


The following object is masked from 'package:sp':

    geometry

2. Load Seurat Object

#Load Seurat Object merged from cell lines and a control(PBMC) after filtration
load("0-imp_Robj/SS_CD4_Tcells_Azimuth_Annotated_PBMC10x_excluding_nonCD4_cells_from_Control_Bcells_from_L4_and_ILC_NK_just_oneCell_CD14Mono.robj")

All_samples_Merged <- filtered_seurat

Summarizing Seurat Object

# Load necessary libraries
library(Seurat)

# Display basic metadata summary
head(All_samples_Merged@meta.data)
# Check if columns such as `orig.ident`, `nCount_RNA`, `nFeature_RNA`, `nUMI`, `ngene`, and any other necessary columns exist
required_columns <- c("orig.ident", "nCount_RNA", "nFeature_RNA", "nUMI", "ngene")
missing_columns <- setdiff(required_columns, colnames(All_samples_Merged@meta.data))

if (length(missing_columns) > 0) {
    cat("Missing columns:", paste(missing_columns, collapse = ", "), "\n")
} else {
    cat("All required columns are present.\n")
}
All required columns are present.
# Check cell counts and features
cat("Number of cells:", ncol(All_samples_Merged), "\n")
Number of cells: 49360 
cat("Number of features:", nrow(All_samples_Merged), "\n")
Number of features: 26179 
# Verify that each `orig.ident` label has the correct number of cells
cat("Cell counts per group:\n")
Cell counts per group:
print(table(All_samples_Merged$orig.ident))

     L1      L2      L3      L4      L5      L6      L7    PBMC PBMC10x 
   5825    5935    6427    6007    6017    5148    5325    5171    3505 
# Check that the cell IDs are unique (which ensures no issues from merging)
if (any(duplicated(colnames(All_samples_Merged)))) {
    cat("Warning: There are duplicated cell IDs.\n")
} else {
    cat("Cell IDs are unique.\n")
}
Cell IDs are unique.
# Check the assay consistency for RNA
DefaultAssay(All_samples_Merged) <- "RNA"

# Check dimensions of the RNA counts layer using the new method
cat("Dimensions of the RNA counts layer:", dim(GetAssayData(All_samples_Merged, layer = "counts")), "\n")
Dimensions of the RNA counts layer: 36601 49360 
cat("Dimensions of the RNA data layer:", dim(GetAssayData(All_samples_Merged, layer = "data")), "\n")
Dimensions of the RNA data layer: 36601 49360 
# Check the ADT assay (optional)
if ("ADT" %in% names(All_samples_Merged@assays)) {
    cat("ADT assay is present.\n")
    cat("Dimensions of the ADT counts layer:", dim(GetAssayData(All_samples_Merged, assay = "ADT", layer = "counts")), "\n")
} else {
    cat("ADT assay is not present.\n")
}
ADT assay is present.
Dimensions of the ADT counts layer: 56 49360 

Azimuth Annotation

# InstallData("pbmcref")
# 
# # The RunAzimuth function can take a Seurat object as input
# All_samples_Merged <- RunAzimuth(All_samples_Merged, reference = "pbmcref")

3. QC

# Remove the percent.mito column
All_samples_Merged$percent.mito <- NULL
Warning: Cannot find cell-level meta data named percent.mito
# Set identity classes to an existing column in meta data
Idents(object = All_samples_Merged) <- "cell_line"

All_samples_Merged[["percent.rb"]] <- PercentageFeatureSet(All_samples_Merged, 
                                                           pattern = "^RP[SL]")
# Convert 'percent.mt' to numeric, replacing "NaN" with 0
All_samples_Merged$percent.rb <- replace(as.numeric(All_samples_Merged$percent.rb), is.na(All_samples_Merged$percent.rb), 0)



# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
All_samples_Merged[["percent.mt"]] <- PercentageFeatureSet(All_samples_Merged, pattern = "^MT-")

# Convert 'percent.mt' to numeric, replacing "NaN" with 0
All_samples_Merged$percent.mt <- replace(as.numeric(All_samples_Merged$percent.mt), is.na(All_samples_Merged$percent.mt), 0)





VlnPlot(All_samples_Merged, features = c("nFeature_RNA", 
                                         "nCount_RNA", 
                                         "percent.mt",
                                         "percent.rb"), 
                            ncol = 4, pt.size = 0.1) & 
              theme(plot.title = element_text(size=10))

FeatureScatter(All_samples_Merged, feature1 = "percent.mt", 
                                  feature2 = "percent.rb")

VlnPlot(All_samples_Merged, features = c("nFeature_RNA", 
                                    "nCount_RNA", 
                                    "percent.mt"), 
                                      ncol = 3)

FeatureScatter(All_samples_Merged, 
               feature1 = "percent.mt", 
               feature2 = "percent.rb") +
        geom_smooth(method = 'lm')
`geom_smooth()` using formula = 'y ~ x'

FeatureScatter(All_samples_Merged, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA") +
        geom_smooth(method = 'lm')
`geom_smooth()` using formula = 'y ~ x'

##FeatureScatter is typically used to visualize feature-feature relationships ##for anything calculated by the object, ##i.e. columns in object metadata, PC scores etc.

FeatureScatter(All_samples_Merged, 
               feature1 = "nCount_RNA", 
               feature2 = "percent.mt")+
  geom_smooth(method = 'lm')
`geom_smooth()` using formula = 'y ~ x'

FeatureScatter(All_samples_Merged, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA")+
  geom_smooth(method = 'lm')
`geom_smooth()` using formula = 'y ~ x'

Assign Cell-Cycle Scores

Running SCTransform on assay: RNA
Running SCTransform on layer: counts
vst.flavor='v2' set. Using model with fixed slope and excluding poisson genes.
Variance stabilizing transformation of count matrix of size 26174 by 49360
Model formula is y ~ log_umi
Get Negative Binomial regression parameters per gene
Using 2000 genes, 5000 cells
Found 469 outliers - those will be ignored in fitting/regularization step
Second step: Get residuals using fitted parameters for 26174 genes
Computing corrected count matrix for 26174 genes
Calculating gene attributes
Wall clock passed: Time difference of 8.486507 mins
Determine variable features
Getting residuals for block 1(of 10) for counts dataset
Getting residuals for block 2(of 10) for counts dataset
Getting residuals for block 3(of 10) for counts dataset
Getting residuals for block 4(of 10) for counts dataset
Getting residuals for block 5(of 10) for counts dataset
Getting residuals for block 6(of 10) for counts dataset
Getting residuals for block 7(of 10) for counts dataset
Getting residuals for block 8(of 10) for counts dataset
Getting residuals for block 9(of 10) for counts dataset
Getting residuals for block 10(of 10) for counts dataset
Finished calculating residuals for counts
Warning: Different cells and/or features from existing assay SCT
Set default assay to SCT
Warning: The following features are not present in the object: MLF1IP, not
searching for symbol synonyms
Warning: The following features are not present in the object: FAM64A, HN1, not
searching for symbol synonyms

4. Normalize data

# Apply SCTransform
All_samples_Merged <- SCTransform(All_samples_Merged, 
                                  vars.to.regress = c("percent.rb","percent.mt", "CC.Difference", "nCount_RNA","nFeature_RNA"), 
                                  do.scale=TRUE, 
                                  do.center=TRUE, 
                                  verbose = TRUE)
Running SCTransform on assay: RNA
Running SCTransform on layer: counts
vst.flavor='v2' set. Using model with fixed slope and excluding poisson genes.
Variance stabilizing transformation of count matrix of size 26174 by 49360
Model formula is y ~ log_umi
Get Negative Binomial regression parameters per gene
Using 2000 genes, 5000 cells
Found 469 outliers - those will be ignored in fitting/regularization step
Second step: Get residuals using fitted parameters for 26174 genes
Computing corrected count matrix for 26174 genes
Calculating gene attributes
Wall clock passed: Time difference of 6.91823 mins
Determine variable features
Regressing out percent.rb, percent.mt, CC.Difference, nCount_RNA, nFeature_RNA
Centering and scaling data matrix
Getting residuals for block 1(of 10) for counts dataset
Getting residuals for block 2(of 10) for counts dataset
Getting residuals for block 3(of 10) for counts dataset
Getting residuals for block 4(of 10) for counts dataset
Getting residuals for block 5(of 10) for counts dataset
Getting residuals for block 6(of 10) for counts dataset
Getting residuals for block 7(of 10) for counts dataset
Getting residuals for block 8(of 10) for counts dataset
Getting residuals for block 9(of 10) for counts dataset
Getting residuals for block 10(of 10) for counts dataset
Regressing out percent.rb, percent.mt, CC.Difference, nCount_RNA, nFeature_RNA
Centering and scaling data matrix
Finished calculating residuals for counts
Set default assay to SCT

5. Perform PCA

Variables_genes <- All_samples_Merged@assays$SCT@var.features

# Exclude genes starting with "HLA-" AND "Xist" AND "TRBV, TRAV"
Variables_genes_after_exclusion <- Variables_genes[!grepl("^HLA-|^XIST|^TRBV|^TRAV", Variables_genes)]

# Set the seed for clustering steps
set.seed(123)

# These are now standard steps in the Seurat workflow for visualization and clustering
All_samples_Merged <- RunPCA(All_samples_Merged,
                        features = Variables_genes_after_exclusion,
                        do.print = TRUE, 
                        pcs.print = 1:5, 
                        genes.print = 15,
                        npcs = 50)
PC_ 1 
Positive:  KIR3DL1, CD7, PRKCH, SEPTIN9, KIR2DL3, EPCAM, CST7, MATK, ESYT2, XCL1 
       MYO1E, TRGV2, KLRC1, CLEC2B, KIR2DL4, GZMM, KLRK1, KIR3DL2, CXCR3, PTPN6 
       CD3G, KRT86, XCL2, TOX, PTPRC, ZBTB16, CD96, CD52, CD5, KRT81 
Negative:  NPM1, SEC11C, YBX3, MTHFD2, MTDH, IL2RA, VDAC1, RBM17, HDGFL3, CD74 
       VIM, CCND2, CCT8, HINT2, BATF3, RAD21, C12orf75, GAPDH, KRT7, SPATS2L 
       PRELID1, HTATIP2, MYL6, PRDX1, MIR155HG, MIIP, RAN, RDH10, MINDY3, PKM 
PC_ 2 
Positive:  PAGE5, RPL35A, NDUFV2, RBPMS, CD74, CDKN2A, TENM3, RPL22L1, KIF2A, LMNA 
       RPL11, RPS3A, PSMB9, PLD1, STMN1, SH3KBP1, PPP2R2B, ANXA5, VAMP5, FAM50B 
       FAM241A, STAT1, RPS14, GPX4, SPOCK1, IFI27L2, HEBP2, ZC2HC1A, ANXA2, PSMB2 
Negative:  C12orf75, TNFRSF4, HACD1, EGFL6, TIGIT, RPS17, BACE2, ARPC2, LY6E, PARK7 
       CYBA, SYT4, SCCPDH, ENO1, GGH, NET1, ATP5MC1, APRT, PTP4A3, PXYLP1 
       CCL17, PON2, GRIA4, UBE2D2, MAP1B, EEF1A2, MRPS6, ADGRB3, KRT7, RHOC 
PC_ 3 
Positive:  RPL39, RPL30, RPS27, MALAT1, RPS4Y1, RPL34, MT-ND3, FYB1, RPS29, ETS1 
       BTG1, TCF7, TPT1, SARAF, ZBTB20, SELL, IL7R, TXNIP, LINC00861, RIPOR2 
       CAMK4, MAML2, PNRC1, PIK3IP1, CYTH1, ANK3, THEMIS, MT-ND5, PTPRC, PDE7A 
Negative:  NME2, PFN1, MIF, RPL35, CHCHD2, ATP5MC3, RPS15, C1QBP, RPL27A, NDUFA4 
       EIF4A1, ACTB, KIR3DL2, COX6A1, RPL19, MDH2, CLIC1, KIR3DL1, NME1, PPIA 
       MT-CO2, EIF5A, DAD1, CST7, KIR2DL3, HMGN2, TRGV2, RAB25, EPCAM, TUBA1B 
PC_ 4 
Positive:  RPS4X, GAS5, KRT1, EGLN3, LINC02752, TTC29, TBX4, WFDC1, RPLP1, RPL13 
       PLCB1, AC069410.1, IFNGR1, TNS4, SP5, IL32, FAM9C, SEMA4A, NKG7, IL4 
       S100A11, LINC00469, RPLP0, PTGIS, S100A4, CEBPD, HSPB1, VIPR2, S100A6, NPTX1 
Negative:  HSPE1, EIF5A, RPL34, ATP5MC3, RPS4Y1, MT-ND3, ODC1, CYC1, CHCHD10, CYCS 
       HSPD1, RPL39, PPBP, RPS29, PPID, GCSH, FKBP4, HSP90AA1, GSTP1, RPL30 
       FCER2, MT-ND1, TCF7, FAM162A, DNAJC12, ATP5F1B, TOMM40, CD7, PRELID3B, CCT6A 
PC_ 5 
Positive:  TMSB4X, S100A4, TMSB10, LGALS1, COTL1, S100A6, S100A11, IFITM2, TP73, LSP1 
       GPAT3, TMEM163, DUSP4, CRIP1, LIME1, HOXC9, GAS2L1, TAGLN2, CARHSP1, GPAT2 
       TNFRSF18, ENTPD1, LAPTM5, EEF1A2, QPRT, PRDX5, F2R, HIST1H1C, IFITM1, PXYLP1 
Negative:  CCL17, MIR155HG, MAP4K4, RXFP1, MYO1D, CA10, CFI, CA2, PRKCA, FRMD4A 
       NFIB, LRBA, LTA, THY1, SLC35F3, IMMP2L, AL590550.1, RUNX1, HS3ST1, SNTB1 
       RANBP17, CCL5, AKAP12, IGHE, AC100801.1, NME2, DOCK10, EZH2, MGST3, AFP 
# determine dimensionality of the data
ElbowPlot(All_samples_Merged, ndims = 50)

6. Perform PCA TEST

library(ggplot2)
library(RColorBrewer)  

# Assuming you have 10 different cell lines, generating a color palette with 10 colors
cell_line_colors <- brewer.pal(10, "Set3")

# Assuming All_samples_Merged$cell_line is a factor or character vector containing cell line names
data <- as.data.frame(table(All_samples_Merged$cell_line))
colnames(data) <- c("cell_line", "nUMI")  # Change column name to nUMI

ncells <- ggplot(data, aes(x = cell_line, y = nUMI, fill = cell_line)) + 
  geom_col() +
  theme_classic() +
  geom_text(aes(label = nUMI), 
            position = position_dodge(width = 0.9), 
            vjust = -0.25) +
  scale_fill_manual(values = cell_line_colors) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5)) +  # Adjust the title position
  ggtitle("Filtered cells per sample") +
  xlab("Cell lines") +  # Adjust x-axis label
  ylab("Frequency")    # Adjust y-axis label

print(ncells)

# TEST-1
# given that the output of RunPCA is "pca"
# replace "so" by the name of your seurat object

pct <- All_samples_Merged[["pca"]]@stdev / sum(All_samples_Merged[["pca"]]@stdev) * 100
cumu <- cumsum(pct) # Calculate cumulative percents for each PC
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which((pct[-length(pct)] - pct[-1]) > 0.1), decreasing = T)[1] + 1
# last point where change of % of variation is more than 0.1%. -> co2
co2
[1] 16
# TEST-2
# get significant PCs
stdv <- All_samples_Merged[["pca"]]@stdev
sum.stdv <- sum(All_samples_Merged[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
cumulative <- cumsum(percent.stdv)
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co2 <- sort(which((percent.stdv[1:length(percent.stdv) - 1] - 
                       percent.stdv[2:length(percent.stdv)]) > 0.1), 
              decreasing = T)[1] + 1
min.pc <- min(co1, co2)
min.pc
[1] 16
# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv, 
           cumu = cumulative, 
           rank = 1:length(percent.stdv))

# Elbow plot to visualize 
  ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) + 
  geom_text() + 
  geom_vline(xintercept = 90, color = "grey") + 
  geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
  theme_bw()

7. Clustering

# Set the seed for clustering steps
set.seed(123)

All_samples_Merged <- FindNeighbors(All_samples_Merged, 
                                dims = 1:16, 
                                verbose = FALSE)

# understanding resolution
All_samples_Merged <- FindClusters(All_samples_Merged, 
                                    resolution = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7,0.8, 0.9, 1,1.2,1.5,2))
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 49360
Number of edges: 1623362

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9866
Number of communities: 11
Elapsed time: 17 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 49360
Number of edges: 1623362

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9760
Number of communities: 12
Elapsed time: 27 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 49360
Number of edges: 1623362

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9656
Number of communities: 13
Elapsed time: 23 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 49360
Number of edges: 1623362

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9554
Number of communities: 14
Elapsed time: 18 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 49360
Number of edges: 1623362

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9458
Number of communities: 15
Elapsed time: 18 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 49360
Number of edges: 1623362

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9373
Number of communities: 20
Elapsed time: 17 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 49360
Number of edges: 1623362

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9290
Number of communities: 20
Elapsed time: 18 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 49360
Number of edges: 1623362

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9209
Number of communities: 20
Elapsed time: 20 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 49360
Number of edges: 1623362

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9133
Number of communities: 23
Elapsed time: 24 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 49360
Number of edges: 1623362

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9060
Number of communities: 23
Elapsed time: 26 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 49360
Number of edges: 1623362

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8944
Number of communities: 27
Elapsed time: 28 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 49360
Number of edges: 1623362

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8805
Number of communities: 34
Elapsed time: 31 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 49360
Number of edges: 1623362

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8615
Number of communities: 40
Elapsed time: 21 seconds

UMAP Visualization

# Set the seed for clustering steps
set.seed(123)

# non-linear dimensionality reduction --------------
All_samples_Merged <- RunUMAP(All_samples_Merged, 
                          dims = 1:16,
                          verbose = FALSE)
Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session
# note that you can set `label = TRUE` or use the Label Clusters function to help label
# individual clusters
DimPlot(All_samples_Merged,group.by = "cell_line", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(All_samples_Merged,group.by = "predicted.celltype.l2", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.0.1",
        reduction = "umap",
        label.size = 3,
        repel = T, 
        label = T, label.box = T)

DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.0.2",
        reduction = "umap",
        label.size = 3,
        repel = T, 
        label = T, label.box = T)

DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.0.3",
        reduction = "umap",
        label.size = 3,
        repel = T, 
        label = T, label.box = T)

DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.0.4", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.0.5", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.0.6", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.0.7", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.0.8", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.0.9", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.1", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.1.2", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.1.5", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.2", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

# Set identity classes to an existing column in meta data
Idents(object = All_samples_Merged) <- "SCT_snn_res.0.9"

cluster_table <- table(Idents(All_samples_Merged))


barplot(cluster_table, main = "Number of Cells in Each Cluster", 
                      xlab = "Cluster", 
                      ylab = "Number of Cells", 
                      col = rainbow(length(cluster_table)))

print(cluster_table)

   0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
6254 5814 4930 4563 3378 3346 3342 3218 2684 2408 2245 1903 1745  841  502  455 
  16   17   18   19   20   21   22 
 422  412  329  201  197  116   55 
table(All_samples_Merged$predicted.celltype.l2, All_samples_Merged$SCT_snn_res.0.1)
                   
                       0    1    2    3    4    5    6    7    8    9   10
  B intermediate       1    3    0    0    0    2    0    1    0    0    0
  B memory             9   19    1    0  113   82    0   27    0    1    0
  CD4 CTL              0    0    0    0    0    0   12    0    0    0    1
  CD4 Naive            0    7    0    0    0    0  522    0 1512    0    1
  CD4 Proliferating 5446 2484 5390 2852 4152 4050    1 3282    7 1347    0
  CD4 TCM            878 3426  522  268  465  586 4478  114 1999   41   53
  CD4 TEM              0    1    0    0    0    0   62    0   21    0    0
  CD8 Proliferating    0    0    0    0    1    1    0    0    0    0    0
  CD8 TCM              0    1    0   16    0    0    0    0    0    0    0
  CD8 TEM              0    1    0    8    1    3    0    2    0    0    0
  cDC1                 0    0    0    0    0    6    0    2    0    0    0
  cDC2                 0    3    2    0   37    3    0    8    0    0    0
  dnT                  0    3    1    1    3    4    0    2    1    0    0
  HSPC                57    9    1    0  490  213    1  683    0  351    0
  NK Proliferating     4   40   23 2785   36  262    0   10    0    1    0
  Treg                15   15    1    0    0   13    0    0    1    0    0

8. clusTree

clustree(All_samples_Merged, prefix = "SCT_snn_res.")

9. Azimuth Annotation

# InstallData("pbmcref")
# 
# # The RunAzimuth function can take a Seurat object as input
# All_samples_Merged <- RunAzimuth(All_samples_Merged, reference = "pbmcref")

10. Azimuth Visualization

DimPlot(All_samples_Merged, group.by = "predicted.celltype.l1", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(All_samples_Merged, group.by = "predicted.celltype.l1", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = F)

DimPlot(All_samples_Merged, group.by = "predicted.celltype.l2", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(All_samples_Merged, group.by = "predicted.celltype.l2", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = F)

DimPlot(All_samples_Merged, group.by = "predicted.celltype.l2", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

table(All_samples_Merged$predicted.celltype.l2, All_samples_Merged$SCT_snn_res.0.2)
                   
                       0    1    2    3    4    5    6    7    8    9   10   11
  B intermediate       4    0    0    0    0    2    0    1    0    0    0    0
  B memory            13    1    0   14  113   83    0   25    0    3    0    0
  CD4 CTL              0    0    0    0    0    0   12    0    0    0    0    1
  CD4 Naive            7    0    0    0    0    0  522    0 1479    0   33    1
  CD4 Proliferating 5447 5390 2849 2480 4151 4053    1 3235    6 1395    4    0
  CD4 TCM            930  522  266 3349  465  608 4480  111 1837   44  165   53
  CD4 TEM              0    0    0    1    0    0   62    0   21    0    0    0
  CD8 Proliferating    0    0    0    0    1    1    0    0    0    0    0    0
  CD8 TCM              0    0   16    1    0    0    0    0    0    0    0    0
  CD8 TEM              0    0    6    1    1    3    0    2    0    0    2    0
  cDC1                 0    0    0    0    0    6    0    2    0    0    0    0
  cDC2                 0    2    0    3   37    3    0    8    0    0    0    0
  dnT                  0    1    0    1    3    5    0    2    0    0    3    0
  HSPC                62    1    0    3  490  214    1  675    0  359    0    0
  NK Proliferating     4   23 2785   40   36  262    0   10    0    1    0    0
  Treg                17    1    0    1    0   25    0    0    0    0    1    0

Save the Seurat object as an Robj file

save(All_samples_Merged, file = "0-imp_Robj/All_Samples_Merged_with_10x_Azitmuth_Annotated_SCT_HPC_without_harmony_integration_removed_nonCD4cells_from_control_and_Bcells_from_L4_ILC_NK_CD14Mono_ready_for_Harmony.robj")

11. Harmony Integration

# Load required libraries
library(Seurat)
library(harmony)
Loading required package: Rcpp
library(ggplot2)

# Run Harmony, adjusting for batch effect using "cell_line" or another grouping variable
All_samples_Merged <- RunHarmony(
  All_samples_Merged,
  group.by.vars = "cell_line",  # Replace with the metadata column specifying batch or cell line
  assay.use="SCT")
Transposing data matrix
Initializing state using k-means centroids initialization
Harmony 1/10
Harmony 2/10
Harmony converged after 2 iterations
# Check results in harmony embeddings
harmony_embeddings <- Embeddings(All_samples_Merged, reduction = "harmony")
head(harmony_embeddings)
                       harmony_1  harmony_2  harmony_3 harmony_4  harmony_5
L1_AAACCTGAGGGCTTCC-1 -1.6599519 -0.6042078 -2.1767671  7.437821  2.0512545
L1_AAACCTGGTGCAGGTA-1 11.4194402 -1.5675935  1.4413074 12.644508 -2.3901265
L1_AAACCTGGTTAAAGTG-1 10.8595239 -2.1892066 -3.9696178  7.474795  0.7275732
L1_AAACCTGTCAGGTAAA-1 -0.9505162 -2.9662790 -8.1845397  1.296843  1.4421326
L1_AAACCTGTCCCTGACT-1 -0.8488519  0.1114262  0.3429697  4.279203  1.7309959
L1_AAACCTGTCCTTCAAT-1 14.5053396 -0.5796922  7.7985637  5.899191 -4.5096397
                       harmony_6  harmony_7  harmony_8  harmony_9 harmony_10
L1_AAACCTGAGGGCTTCC-1 -1.2423685 -1.4986240 -2.7549075  6.6745310  -3.658267
L1_AAACCTGGTGCAGGTA-1 -6.4736791 -0.6286608  0.7579979  1.7228589   2.244277
L1_AAACCTGGTTAAAGTG-1 -3.3793447 -0.9384437 -4.8161777 -2.9374133   2.643614
L1_AAACCTGTCAGGTAAA-1  0.8011309 -0.1859293 -1.7865230 -2.7456625   3.106090
L1_AAACCTGTCCCTGACT-1 -0.6455216 -0.7015163 -0.9235505  3.9217212  -1.709181
L1_AAACCTGTCCTTCAAT-1 -3.6686882 -5.0254989 -4.5074255  0.9570588  -2.854292
                      harmony_11  harmony_12   harmony_13 harmony_14 harmony_15
L1_AAACCTGAGGGCTTCC-1  -1.322096 -1.16546592 -0.764112903 -2.1137972  0.0133156
L1_AAACCTGGTGCAGGTA-1   2.222533 -2.01864065 -0.172401947  0.6180905  0.9612908
L1_AAACCTGGTTAAAGTG-1  -3.599226  1.40836783  2.008425810  3.8432841  4.1550565
L1_AAACCTGTCAGGTAAA-1  -1.076842  1.94986597  0.805000046  2.0696300  0.4127603
L1_AAACCTGTCCCTGACT-1  -1.217175 -0.04311119 -0.395346221 -2.1573804 -1.9110388
L1_AAACCTGTCCTTCAAT-1  -1.205150 -1.76806134 -0.008277499  0.5937775 -0.7083041
                      harmony_16   harmony_17 harmony_18 harmony_19 harmony_20
L1_AAACCTGAGGGCTTCC-1 -1.4297335 -0.272608582  0.2907540  2.1970598   5.409374
L1_AAACCTGGTGCAGGTA-1  0.9363010  0.006760549 -1.1543828 -2.5087616   5.555734
L1_AAACCTGGTTAAAGTG-1 -0.1973937 -3.183598315 -0.8443751 -1.5115294   2.011382
L1_AAACCTGTCAGGTAAA-1  0.6498343  0.383791567 -0.9146736 -0.2299789  -3.869947
L1_AAACCTGTCCCTGACT-1 -4.4623498 -3.781479910  0.8255265  4.4057988   1.253731
L1_AAACCTGTCCTTCAAT-1 -0.9358707 -0.996784853  0.1241228 -0.8460764   1.944907
                      harmony_21 harmony_22 harmony_23  harmony_24 harmony_25
L1_AAACCTGAGGGCTTCC-1 -2.3410732  7.9293463  4.5220061  3.60975279  1.2776873
L1_AAACCTGGTGCAGGTA-1 -1.6621798  4.2146595  0.9106451 -0.90629578  1.4886328
L1_AAACCTGGTTAAAGTG-1 -2.6231293  3.3725710  2.7871020  0.02608648  0.3361270
L1_AAACCTGTCAGGTAAA-1  0.8092308 -0.7023921 -1.3844510 -0.04823276 -0.8863482
L1_AAACCTGTCCCTGACT-1 -2.3847306  3.5108056  1.4364981  0.47188719 -0.3891612
L1_AAACCTGTCCTTCAAT-1 -1.5506749  4.7843711  0.4238190  4.07368448 -0.2228818
                       harmony_26   harmony_27 harmony_28 harmony_29 harmony_30
L1_AAACCTGAGGGCTTCC-1  0.22012527 -1.250994471  0.1475193  1.3827704 -0.2118126
L1_AAACCTGGTGCAGGTA-1  0.71002127 -0.006560484 -0.8059567 -0.9707122 -1.4538334
L1_AAACCTGGTTAAAGTG-1  1.17543361 -1.246475772  1.0056730  2.0215198 -0.2995155
L1_AAACCTGTCAGGTAAA-1  0.86500268 -0.191931945 -0.3850697 -1.8405108 -0.9803550
L1_AAACCTGTCCCTGACT-1 -1.82773708  0.530698436  0.9330416  1.6543250  1.0844918
L1_AAACCTGTCCTTCAAT-1 -0.03096043 -1.968962804  0.3313785 -1.7702291 -1.2887631
                      harmony_31 harmony_32 harmony_33 harmony_34 harmony_35
L1_AAACCTGAGGGCTTCC-1  1.9948229  0.1374334  1.9588237  0.8482296 -2.0136414
L1_AAACCTGGTGCAGGTA-1 -1.4644421  1.0951582 -2.7021803 -0.9521190  1.8023297
L1_AAACCTGGTTAAAGTG-1  3.2674166 -0.6291023  0.6443879  0.7580062 -1.0470746
L1_AAACCTGTCAGGTAAA-1 -0.5629333  1.2345308 -0.8800303  0.8529725 -1.3772804
L1_AAACCTGTCCCTGACT-1  1.5498537 -1.8495021  3.1636526  1.3887400 -0.9734873
L1_AAACCTGTCCTTCAAT-1  1.1713271  0.7644674 -1.7100090  1.5209922 -2.6954757
                      harmony_36 harmony_37 harmony_38 harmony_39 harmony_40
L1_AAACCTGAGGGCTTCC-1  0.0591596 -0.5522977 -1.4096069 -0.9937449 -0.1883725
L1_AAACCTGGTGCAGGTA-1 -1.5232604  0.3447356  0.7329088 -1.2555711 -1.3565734
L1_AAACCTGGTTAAAGTG-1  1.3533220  0.4352095  0.7616906  0.4871637  0.4225370
L1_AAACCTGTCAGGTAAA-1  0.1677546 -0.4515435  0.9398611  0.6255458  0.3996444
L1_AAACCTGTCCCTGACT-1 -1.3870516 -0.3571341 -3.4661030 -0.3901332  1.2014202
L1_AAACCTGTCCTTCAAT-1 -2.2975146  0.3973966 -0.1197209 -0.8249204 -0.9024280
                      harmony_41 harmony_42 harmony_43 harmony_44  harmony_45
L1_AAACCTGAGGGCTTCC-1  0.4016596  0.1595270 0.43923229 -0.2485869  1.58714586
L1_AAACCTGGTGCAGGTA-1  1.0716623 -1.5143261 0.06346089  0.8082673  0.08620501
L1_AAACCTGGTTAAAGTG-1 -1.3731975 -1.1118409 0.14377751 -0.4626890 -1.96623295
L1_AAACCTGTCAGGTAAA-1  0.9125830  1.1133472 0.60541745 -2.1485228 -0.87005765
L1_AAACCTGTCCCTGACT-1 -2.6284258  1.0738953 1.23452387 -0.2750574  0.73764963
L1_AAACCTGTCCTTCAAT-1 -2.9080001  0.3364856 2.14097555 -2.0625919  0.75785767
                       harmony_46 harmony_47 harmony_48 harmony_49 harmony_50
L1_AAACCTGAGGGCTTCC-1 -1.24689684 -1.0149490 -0.2588864  1.0355046 -0.4284293
L1_AAACCTGGTGCAGGTA-1  0.73143044  1.4290411 -0.3547001  1.6450457  0.4239118
L1_AAACCTGGTTAAAGTG-1 -1.86318391  0.4023728 -2.5357028 -1.4524201  1.0623882
L1_AAACCTGTCAGGTAAA-1  0.07497276  1.3905798 -0.9033516 -0.6815336  0.5546704
L1_AAACCTGTCCCTGACT-1 -1.43412240 -0.4466466  1.5685796  0.3959515 -0.5471844
L1_AAACCTGTCCTTCAAT-1 -1.38441167  0.9351336 -1.0341636 -2.0737713  1.2920963
# Set the seed for clustering steps
set.seed(123)

# Run UMAP on Harmony embeddings
All_samples_Merged <- RunUMAP(All_samples_Merged, reduction = "harmony", dims = 1:16)
17:04:44 UMAP embedding parameters a = 0.9922 b = 1.112
17:04:44 Read 49360 rows and found 16 numeric columns
17:04:44 Using Annoy for neighbor search, n_neighbors = 30
17:04:44 Building Annoy index with metric = cosine, n_trees = 50
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
17:04:50 Writing NN index file to temp file /tmp/RtmpmpbTET/file1418237503969
17:04:50 Searching Annoy index using 1 thread, search_k = 3000
17:05:09 Annoy recall = 100%
17:05:12 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
17:05:18 Initializing from normalized Laplacian + noise (using RSpectra)
17:05:20 Commencing optimization for 200 epochs, with 2086400 positive edges
17:06:31 Optimization finished
# Set the seed for clustering steps
set.seed(123)

# Optionally, find neighbors and clusters (if you plan to do clustering analysis)
All_samples_Merged <- FindNeighbors(All_samples_Merged, reduction = "harmony", dims = 1:16)
Computing nearest neighbor graph
Computing SNN
All_samples_Merged <- FindClusters(All_samples_Merged, resolution = 0.5)  # Adjust resolution as needed
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 49360
Number of edges: 1494942

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8819
Number of communities: 15
Elapsed time: 19 seconds
# Visualize UMAP
DimPlot(All_samples_Merged, reduction = "umap", group.by = "cell_line", label = TRUE, pt.size = 0.5) +
    ggtitle("UMAP of Harmony-Integrated Data")

# Visualize UMAP with batch/cell line information
DimPlot(All_samples_Merged, reduction = "umap", group.by = "cell_line", label = TRUE, pt.size = 0.5) +
    ggtitle("UMAP - Colored by Cell Line (After Harmony Integration)")

# Visualize UMAP with clusters
DimPlot(All_samples_Merged, reduction = "umap", group.by = "seurat_clusters", label = TRUE, pt.size = 0.5) +
    ggtitle("UMAP - Clustered Data (After Harmony Integration)")

# Visualize specific cell types or other metadata
DimPlot(All_samples_Merged, reduction = "umap", group.by = "predicted.celltype.l2", label = TRUE, pt.size = 0.5) +
    ggtitle("UMAP - Cell Types After Harmony Integration")

12.Save the Seurat object as an Robj file

#save(All_samples_Merged, file = "../../../0-IMP-OBJECTS/All_Samples_Merged_with_10x_Azitmuth_Annotated_SCT_HPC_without_harmony_integration.robj")
---
title: "Merged All samples with PBMC_10x and removed non CD4 T cells from Control and B cells from L4 and ILC, NK, CD14 Mono and regress nCountRNA and nFeatureRNA and apply SCT"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  #rmdformats::readthedown
  html_notebook:
    toc: true
    toc_float: true
    toc_collapsed: true
---

# 1. load libraries
```{r setup, echo=FALSE}

library(Seurat)
library(SeuratObject)
library(SeuratData)
library(patchwork)

library(dplyr)
library(tidyverse)
library(ggplot2)
library(RColorBrewer)
library(magrittr)
library(dbplyr)
library(rmarkdown)
library(knitr)
library(tinytex)
#Azimuth Annotation libraries
library(Azimuth)

library(clustree)


```


# 2. Load Seurat Object 
```{r load_seurat}

#Load Seurat Object merged from cell lines and a control(PBMC) after filtration
load("0-imp_Robj/SS_CD4_Tcells_Azimuth_Annotated_PBMC10x_excluding_nonCD4_cells_from_Control_Bcells_from_L4_and_ILC_NK_just_oneCell_CD14Mono.robj")

All_samples_Merged <- filtered_seurat
 
```

## Summarizing Seurat Object
```{r summary, fig.height=6, fig.width=10}

# Load necessary libraries
library(Seurat)

# Display basic metadata summary
head(All_samples_Merged@meta.data)

# Check if columns such as `orig.ident`, `nCount_RNA`, `nFeature_RNA`, `nUMI`, `ngene`, and any other necessary columns exist
required_columns <- c("orig.ident", "nCount_RNA", "nFeature_RNA", "nUMI", "ngene")
missing_columns <- setdiff(required_columns, colnames(All_samples_Merged@meta.data))

if (length(missing_columns) > 0) {
    cat("Missing columns:", paste(missing_columns, collapse = ", "), "\n")
} else {
    cat("All required columns are present.\n")
}

# Check cell counts and features
cat("Number of cells:", ncol(All_samples_Merged), "\n")
cat("Number of features:", nrow(All_samples_Merged), "\n")

# Verify that each `orig.ident` label has the correct number of cells
cat("Cell counts per group:\n")
print(table(All_samples_Merged$orig.ident))

# Check that the cell IDs are unique (which ensures no issues from merging)
if (any(duplicated(colnames(All_samples_Merged)))) {
    cat("Warning: There are duplicated cell IDs.\n")
} else {
    cat("Cell IDs are unique.\n")
}

# Check the assay consistency for RNA
DefaultAssay(All_samples_Merged) <- "RNA"

# Check dimensions of the RNA counts layer using the new method
cat("Dimensions of the RNA counts layer:", dim(GetAssayData(All_samples_Merged, layer = "counts")), "\n")
cat("Dimensions of the RNA data layer:", dim(GetAssayData(All_samples_Merged, layer = "data")), "\n")

# Check the ADT assay (optional)
if ("ADT" %in% names(All_samples_Merged@assays)) {
    cat("ADT assay is present.\n")
    cat("Dimensions of the ADT counts layer:", dim(GetAssayData(All_samples_Merged, assay = "ADT", layer = "counts")), "\n")
} else {
    cat("ADT assay is not present.\n")
}


```

## Azimuth Annotation
```{r azimuth_Annotation1, fig.height=6, fig.width=10}
# InstallData("pbmcref")
# 
# # The RunAzimuth function can take a Seurat object as input
# All_samples_Merged <- RunAzimuth(All_samples_Merged, reference = "pbmcref")

```

# 3. QC
```{r QC, fig.height=6, fig.width=10}

# Remove the percent.mito column
All_samples_Merged$percent.mito <- NULL


# Set identity classes to an existing column in meta data
Idents(object = All_samples_Merged) <- "cell_line"

All_samples_Merged[["percent.rb"]] <- PercentageFeatureSet(All_samples_Merged, 
                                                           pattern = "^RP[SL]")
# Convert 'percent.mt' to numeric, replacing "NaN" with 0
All_samples_Merged$percent.rb <- replace(as.numeric(All_samples_Merged$percent.rb), is.na(All_samples_Merged$percent.rb), 0)



# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
All_samples_Merged[["percent.mt"]] <- PercentageFeatureSet(All_samples_Merged, pattern = "^MT-")

# Convert 'percent.mt' to numeric, replacing "NaN" with 0
All_samples_Merged$percent.mt <- replace(as.numeric(All_samples_Merged$percent.mt), is.na(All_samples_Merged$percent.mt), 0)





VlnPlot(All_samples_Merged, features = c("nFeature_RNA", 
                                         "nCount_RNA", 
                                         "percent.mt",
                                         "percent.rb"), 
                            ncol = 4, pt.size = 0.1) & 
              theme(plot.title = element_text(size=10))

FeatureScatter(All_samples_Merged, feature1 = "percent.mt", 
                                  feature2 = "percent.rb")

VlnPlot(All_samples_Merged, features = c("nFeature_RNA", 
                                    "nCount_RNA", 
                                    "percent.mt"), 
                                      ncol = 3)

FeatureScatter(All_samples_Merged, 
               feature1 = "percent.mt", 
               feature2 = "percent.rb") +
        geom_smooth(method = 'lm')

FeatureScatter(All_samples_Merged, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA") +
        geom_smooth(method = 'lm')

```

##FeatureScatter is typically used to visualize feature-feature relationships
##for anything calculated by the object, 
##i.e. columns in object metadata, PC scores etc.

```{r FC, fig.height=6, fig.width=10}

FeatureScatter(All_samples_Merged, 
               feature1 = "nCount_RNA", 
               feature2 = "percent.mt")+
  geom_smooth(method = 'lm')

FeatureScatter(All_samples_Merged, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA")+
  geom_smooth(method = 'lm')

```


##  Assign Cell-Cycle Scores
```{r Regress, echo=FALSE, fig.height=6, fig.width=10}
options(future.globals.maxSize = 8000 * 1024^2)  # Set to 8000 MiB (about 8 GB)


All_samples_Merged <- SCTransform(All_samples_Merged, 
                                   do.scale = FALSE, 
                                   do.center = FALSE)  # Reduce to 1000 variable features


# A list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat.  We can
# segregate this list into markers of G2/M phase and markers of S phase
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes


All_samples_Merged <- CellCycleScoring(All_samples_Merged, 
                                       s.features = s.genes, 
                                       g2m.features = g2m.genes, 
                                       set.ident = TRUE)

DefaultAssay(All_samples_Merged) <- "RNA"
All_samples_Merged$CC.Difference <- All_samples_Merged$S.Score - All_samples_Merged$G2M.Score

```


# 4. Normalize data
```{r Normalize, include=TRUE}


# Apply SCTransform
All_samples_Merged <- SCTransform(All_samples_Merged, 
                                  vars.to.regress = c("percent.rb","percent.mt", "CC.Difference", "nCount_RNA","nFeature_RNA"), 
                                  do.scale=TRUE, 
                                  do.center=TRUE, 
                                  verbose = TRUE)
                                      
```


# 5. Perform PCA
```{r PCA, fig.height=6, fig.width=10}

Variables_genes <- All_samples_Merged@assays$SCT@var.features

# Exclude genes starting with "HLA-" AND "Xist" AND "TRBV, TRAV"
Variables_genes_after_exclusion <- Variables_genes[!grepl("^HLA-|^XIST|^TRBV|^TRAV", Variables_genes)]

# Set the seed for clustering steps
set.seed(123)

# These are now standard steps in the Seurat workflow for visualization and clustering
All_samples_Merged <- RunPCA(All_samples_Merged,
                        features = Variables_genes_after_exclusion,
                        do.print = TRUE, 
                        pcs.print = 1:5, 
                        genes.print = 15,
                        npcs = 50)

# determine dimensionality of the data
ElbowPlot(All_samples_Merged, ndims = 50)


```

# 6. Perform PCA TEST
```{r PCA-TEST, fig.height=6, fig.width=10}


library(ggplot2)
library(RColorBrewer)  

# Assuming you have 10 different cell lines, generating a color palette with 10 colors
cell_line_colors <- brewer.pal(10, "Set3")

# Assuming All_samples_Merged$cell_line is a factor or character vector containing cell line names
data <- as.data.frame(table(All_samples_Merged$cell_line))
colnames(data) <- c("cell_line", "nUMI")  # Change column name to nUMI

ncells <- ggplot(data, aes(x = cell_line, y = nUMI, fill = cell_line)) + 
  geom_col() +
  theme_classic() +
  geom_text(aes(label = nUMI), 
            position = position_dodge(width = 0.9), 
            vjust = -0.25) +
  scale_fill_manual(values = cell_line_colors) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5)) +  # Adjust the title position
  ggtitle("Filtered cells per sample") +
  xlab("Cell lines") +  # Adjust x-axis label
  ylab("Frequency")    # Adjust y-axis label

print(ncells)



# TEST-1
# given that the output of RunPCA is "pca"
# replace "so" by the name of your seurat object

pct <- All_samples_Merged[["pca"]]@stdev / sum(All_samples_Merged[["pca"]]@stdev) * 100
cumu <- cumsum(pct) # Calculate cumulative percents for each PC
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which((pct[-length(pct)] - pct[-1]) > 0.1), decreasing = T)[1] + 1
# last point where change of % of variation is more than 0.1%. -> co2
co2

# TEST-2
# get significant PCs
stdv <- All_samples_Merged[["pca"]]@stdev
sum.stdv <- sum(All_samples_Merged[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
cumulative <- cumsum(percent.stdv)
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co2 <- sort(which((percent.stdv[1:length(percent.stdv) - 1] - 
                       percent.stdv[2:length(percent.stdv)]) > 0.1), 
              decreasing = T)[1] + 1
min.pc <- min(co1, co2)
min.pc

# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv, 
           cumu = cumulative, 
           rank = 1:length(percent.stdv))

# Elbow plot to visualize 
  ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) + 
  geom_text() + 
  geom_vline(xintercept = 90, color = "grey") + 
  geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
  theme_bw()

  

```

# 7. Clustering
```{r C1, fig.height=6, fig.width=10}

# Set the seed for clustering steps
set.seed(123)

All_samples_Merged <- FindNeighbors(All_samples_Merged, 
                                dims = 1:16, 
                                verbose = FALSE)

# understanding resolution
All_samples_Merged <- FindClusters(All_samples_Merged, 
                                    resolution = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7,0.8, 0.9, 1,1.2,1.5,2))


```

## UMAP Visualization
```{r C2, fig.height=6, fig.width=10}
# Set the seed for clustering steps
set.seed(123)

# non-linear dimensionality reduction --------------
All_samples_Merged <- RunUMAP(All_samples_Merged, 
                          dims = 1:16,
                          verbose = FALSE)
                                  

# note that you can set `label = TRUE` or use the Label Clusters function to help label
# individual clusters
DimPlot(All_samples_Merged,group.by = "cell_line", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(All_samples_Merged,group.by = "predicted.celltype.l2", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)
DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.0.1",
        reduction = "umap",
        label.size = 3,
        repel = T, 
        label = T, label.box = T)
DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.0.2",
        reduction = "umap",
        label.size = 3,
        repel = T, 
        label = T, label.box = T)
DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.0.3",
        reduction = "umap",
        label.size = 3,
        repel = T, 
        label = T, label.box = T)


DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.0.4", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)


DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.0.5", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.0.6", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.0.7", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.0.8", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)
DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.0.9", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)
DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.1", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)
DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.1.2", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)
DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.1.5", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(All_samples_Merged,
        group.by = "SCT_snn_res.2", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

# Set identity classes to an existing column in meta data
Idents(object = All_samples_Merged) <- "SCT_snn_res.0.9"

cluster_table <- table(Idents(All_samples_Merged))


barplot(cluster_table, main = "Number of Cells in Each Cluster", 
                      xlab = "Cluster", 
                      ylab = "Number of Cells", 
                      col = rainbow(length(cluster_table)))

print(cluster_table)

table(All_samples_Merged$predicted.celltype.l2, All_samples_Merged$SCT_snn_res.0.1)
```

# 8. clusTree
```{r clusTree, fig.height=12, fig.width=10}
clustree(All_samples_Merged, prefix = "SCT_snn_res.")
```

# 9. Azimuth Annotation
```{r azimuth_Annotation2, fig.height=6, fig.width=10}
# InstallData("pbmcref")
# 
# # The RunAzimuth function can take a Seurat object as input
# All_samples_Merged <- RunAzimuth(All_samples_Merged, reference = "pbmcref")

```

# 10. Azimuth Visualization
```{r azimuth_Visualization, fig.height=6, fig.width=10}
DimPlot(All_samples_Merged, group.by = "predicted.celltype.l1", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(All_samples_Merged, group.by = "predicted.celltype.l1", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = F)

DimPlot(All_samples_Merged, group.by = "predicted.celltype.l2", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(All_samples_Merged, group.by = "predicted.celltype.l2", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = F)


DimPlot(All_samples_Merged, group.by = "predicted.celltype.l2", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)



table(All_samples_Merged$predicted.celltype.l2, All_samples_Merged$SCT_snn_res.0.2)
```

## Save the Seurat object as an Robj file
```{r saveROBJ1, echo=TRUE}

save(All_samples_Merged, file = "0-imp_Robj/All_Samples_Merged_with_10x_Azitmuth_Annotated_SCT_HPC_without_harmony_integration_removed_nonCD4cells_from_control_and_Bcells_from_L4_ILC_NK_CD14Mono_ready_for_Harmony.robj")


```


# 11. Harmony Integration
```{r harmony, fig.height=6, fig.width=10}


# Load required libraries
library(Seurat)
library(harmony)
library(ggplot2)

# Run Harmony, adjusting for batch effect using "cell_line" or another grouping variable
All_samples_Merged <- RunHarmony(
  All_samples_Merged,
  group.by.vars = "cell_line",  # Replace with the metadata column specifying batch or cell line
  assay.use="SCT")

# Check results in harmony embeddings
harmony_embeddings <- Embeddings(All_samples_Merged, reduction = "harmony")
head(harmony_embeddings)

# Set the seed for clustering steps
set.seed(123)

# Run UMAP on Harmony embeddings
All_samples_Merged <- RunUMAP(All_samples_Merged, reduction = "harmony", dims = 1:16)

# Set the seed for clustering steps
set.seed(123)

# Optionally, find neighbors and clusters (if you plan to do clustering analysis)
All_samples_Merged <- FindNeighbors(All_samples_Merged, reduction = "harmony", dims = 1:16)
All_samples_Merged <- FindClusters(All_samples_Merged, resolution = 0.5)  # Adjust resolution as needed

# Visualize UMAP
DimPlot(All_samples_Merged, reduction = "umap", group.by = "cell_line", label = TRUE, pt.size = 0.5) +
    ggtitle("UMAP of Harmony-Integrated Data")


# Visualize UMAP with batch/cell line information
DimPlot(All_samples_Merged, reduction = "umap", group.by = "cell_line", label = TRUE, pt.size = 0.5) +
    ggtitle("UMAP - Colored by Cell Line (After Harmony Integration)")


# Visualize UMAP with clusters
DimPlot(All_samples_Merged, reduction = "umap", group.by = "seurat_clusters", label = TRUE, pt.size = 0.5) +
    ggtitle("UMAP - Clustered Data (After Harmony Integration)")

# Visualize specific cell types or other metadata
DimPlot(All_samples_Merged, reduction = "umap", group.by = "predicted.celltype.l2", label = TRUE, pt.size = 0.5) +
    ggtitle("UMAP - Cell Types After Harmony Integration")



```


# 12.Save the Seurat object as an Robj file
```{r saveROBJ2, echo=TRUE}

#save(All_samples_Merged, file = "../../../0-IMP-OBJECTS/All_Samples_Merged_with_10x_Azitmuth_Annotated_SCT_HPC_without_harmony_integration.robj")


```





