Cell Line L5 Analysis

1. load libraries

2. Load Seurat Object

#Load Seurat Object L5
load("../Documents/1-SS-STeps/4-Analysis_and_Robj_Marie/analyse juillet 2023/ObjetsR/L5.Robj")

L5
## An object of class Seurat 
## 36629 features across 6022 samples within 2 assays 
## Active assay: RNA (36601 features, 0 variable features)
##  2 layers present: counts, data
##  1 other assay present: ADT

3. QC

# Set identity classes to an existing column in meta data
Idents(object = L5) <- "cell_line"

L5[["percent.rb"]] <- PercentageFeatureSet(L5, pattern = "^RP[SL]")

VlnPlot(L5, features = c("nFeature_RNA", 
                                         "nCount_RNA", 
                                         "percent.mito",
                                         "percent.rb"), 
                            ncol = 4, pt.size = 0.1) & 
              theme(plot.title = element_text(size=10))

FeatureScatter(L5, feature1 = "percent.mito", 
                                  feature2 = "percent.rb")

VlnPlot(L5, features = c("nFeature_RNA", 
                                    "nCount_RNA", 
                                    "percent.mito"), 
                                      ncol = 3)

FeatureScatter(L5, 
               feature1 = "percent.mito", 
               feature2 = "percent.rb") +
        geom_smooth(method = 'lm')
## `geom_smooth()` using formula = 'y ~ x'

FeatureScatter(L5, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA") +
        geom_smooth(method = 'lm')
## `geom_smooth()` using formula = 'y ~ x'

##FeatureScatter is typically used to visualize feature-feature relationships ##for anything calculated by the object, ##i.e. columns in object metadata, PC scores etc.

FeatureScatter(L5, 
               feature1 = "nCount_RNA", 
               feature2 = "percent.mito")+
  geom_smooth(method = 'lm')
## `geom_smooth()` using formula = 'y ~ x'

FeatureScatter(L5, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA")+
  geom_smooth(method = 'lm')
## `geom_smooth()` using formula = 'y ~ x'

## Assign Cell-Cycle Scores

## Running SCTransform on assay: RNA
## vst.flavor='v2' set. Using model with fixed slope and excluding poisson genes.
## Calculating cell attributes from input UMI matrix: log_umi
## Variance stabilizing transformation of count matrix of size 18690 by 6022
## Model formula is y ~ log_umi
## Get Negative Binomial regression parameters per gene
## Using 2000 genes, 5000 cells
## Found 117 outliers - those will be ignored in fitting/regularization step
## Second step: Get residuals using fitted parameters for 18690 genes
## Computing corrected count matrix for 18690 genes
## Calculating gene attributes
## Wall clock passed: Time difference of 23.86308 secs
## Determine variable features
## Place corrected count matrix in counts slot
## Set default assay to SCT
## Warning: The following features are not present in the object: MLF1IP, not
## searching for symbol synonyms
## Warning: The following features are not present in the object: FAM64A, HN1, not
## searching for symbol synonyms

4. Normalize data

# Apply SCTransform
L5 <- SCTransform(L5, vars.to.regress = c("percent.rb","percent.mito", "CC.Difference"), 
                  do.scale=TRUE, 
                  do.center=TRUE, 
                  verbose = TRUE)
## Running SCTransform on assay: RNA
## vst.flavor='v2' set. Using model with fixed slope and excluding poisson genes.
## Calculating cell attributes from input UMI matrix: log_umi
## Variance stabilizing transformation of count matrix of size 18690 by 6022
## Model formula is y ~ log_umi
## Get Negative Binomial regression parameters per gene
## Using 2000 genes, 5000 cells
## Found 117 outliers - those will be ignored in fitting/regularization step
## Second step: Get residuals using fitted parameters for 18690 genes
## Computing corrected count matrix for 18690 genes
## Calculating gene attributes
## Wall clock passed: Time difference of 18.53435 secs
## Determine variable features
## Regressing out percent.rb, percent.mito, CC.Difference
## Centering and scaling data matrix
## Place corrected count matrix in counts slot
## Set default assay to SCT

5. Perform PCA

Variables_genes <- L5@assays$SCT@var.features

# Exclude genes starting with "HLA-" AND "Xist" AND "TRBV, TRAV"
Variables_genes_after_exclusion <- Variables_genes[!grepl("^HLA-|^XIST|^TRBV|^TRAV", Variables_genes)]


# These are now standard steps in the Seurat workflow for visualization and clustering
L5 <- RunPCA(L5,
             features = Variables_genes_after_exclusion,
             do.print = TRUE, 
             pcs.print = 1:5, 
             genes.print = 15,
             npcs = 50)
## PC_ 1 
## Positive:  C1QBP, RPL26, GSTP1, EIF5A, EIF4A1, PFN1, BATF3, MGST3, TPRG1, ALOX5AP 
##     PDE4DIP, LTB, CYC1, NAA38, TXNDC17, YWHAE, C1orf162, PSMB6, NUDT8, NPDC1 
##     HSP90B1, HMGN5, SNHG29, ACADVL, CAVIN3, PRDX1, MRPL47, TESC, NDUFAF3, PHGDH 
## Negative:  RPL35A, RPS23, RPS14, RACK1, RPL37, HINT1, AC097518.2, SERP1, MALAT1, MACROD2 
##     SIAH2, COX7C, GPR160, PIM2, ITGA4, PCSK1N, AC104365.1, SUB1, UBE2D2, FTL 
##     CD70, LGALS3, LRP1B, AHNAK, AGMO, EIF2A, TNFSF10, ANXA5, RPL10, MARCKS 
## PC_ 2 
## Positive:  NPM1, HNRNPAB, HSP90AB1, NCL, RPL23, HSPA9, CCT5, HSPD1, HSPE1, EIF4G1 
##     NME1, PABPC1, ODC1, MTDH, DANCR, TCP1, HNRNPU, NOP16, SFPQ, PPID 
##     HSP90AA1, SRM, NME2, SERBP1, CCT6A, KPNB1, HSPA4, UBE2S, CANX, EIF3B 
## Negative:  B2M, S100A11, S100A6, IL32, LGALS1, IL2RG, TMSB10, S1PR4, S100A10, LSP1 
##     ISG20, TMSB4X, RNASEK, EMP3, S100A4, TAGLN2, HPGD, VIM, S100P, FXYD5 
##     DDIT4, GABARAP, CRIP1, CYBA, PLAAT4, PNRC1, BTG1, PFN1, C12orf75, KLF2 
## PC_ 3 
## Positive:  H3F3A, KLF2, TUBA1B, IL32, ARHGDIB, S1PR4, S100A4, SLC9A3R1, TUBA4A, MT-CO3 
##     RPL17, H2AFZ, HMGB1, HPGD, S100P, IFITM2, CORO1A, TNFRSF18, MRPS12, GMFG 
##     NQO1, IFITM1, CRIP1, STMN1, PSMB8, S100A6, RPL6, RPS7, CALM1, H2AFX 
## Negative:  CCR7, PMAIP1, HERC5, OASL, SPAG9, ZC3HAV1, IFIT2, TRAF1, IFIT3, TNFAIP3 
##     ISG15, CCL5, DENND4A, GRAMD1B, PARP14, PELI1, CAMK4, UBE2Z, TNFSF9, ANKRD33B 
##     EPS15, NCF2, TP63, CXCL10, SAR1A, PLCG2, STAT3, RABGAP1L, GAS5, IFIH1 
## PC_ 4 
## Positive:  ISG15, PPA1, OASL, CCL5, IFIT3, LTA, HERC5, ZBTB32, IFIT2, CDC20 
##     SAR1A, DNAJA1, CSAG3, DYNLL1, TNFSF9, PMAIP1, CCR7, PRDX1, HSPA8, CD70 
##     PRR13, CXCL10, CCNB1, DHX58, TNF, ZC3HAV1, NOP16, NCF2, JPT1, IL4I1 
## Negative:  HIST1H1E, HIST1H4C, HIST1H1B, HIST1H1C, MKI67, MTHFD2, RRM2, CEP128, HIST1H1A, PSAT1 
##     HIST1H1D, ATAD2, SMC1A, PRKDC, MYH9, AARS, MT-ND6, TOP2A, RDH10, NSD2 
##     CTH, GARS, NEIL3, TYMS, LMNB1, SLC7A11, RNF213, HIST1H2AE, SMC4, KIF21B 
## PC_ 5 
## Positive:  HIST1H4C, IFIT3, HERC5, IFIT2, HIST1H1E, OASL, ISG15, H2AFZ, PMAIP1, RRM2 
##     CTH, PSAT1, WARS, UBE2Z, RHEBL1, ZC3HAV1, HIST1H1B, NCF2, CCL5, CD74 
##     ZBTB32, MTHFD2, PARP14, HIST1H1C, CHAC1, DHX58, PCK2, IFIH1, PKMYT1, DNAJC12 
## Negative:  ANKRD37, SLC2A3, LDHA, BNIP3, PGK1, PLIN2, MIF, KIF2A, GPI, HSPA8 
##     GDE1, FABP5, DEGS1, C4orf3, GAPDH, PRDM1, HILPDA, BNIP3L, TPI1, PGAM1 
##     GLUL, ENO1, HIF1A-AS3, UBE2S, PRELID3B, CYTIP, CDC20, ENPP2, AK4, MXI1
# determine dimensionality of the data
ElbowPlot(L5, ndims =50)

# Perform PCA TEST

library(ggplot2)
library(RColorBrewer)  

# Assuming you have 10 different cell lines, generating a color palette with 10 colors
cell_line_colors <- brewer.pal(10, "Set3")

# Assuming L5$cell_line is a factor or character vector containing cell line names
data <- as.data.frame(table(L5$cell_line))
colnames(data) <- c("cell_line", "nUMI")  # Change column name to nUMI

ncells <- ggplot(data, aes(x = cell_line, y = nUMI, fill = cell_line)) + 
  geom_col() +
  theme_classic() +
  geom_text(aes(label = nUMI), 
            position = position_dodge(width = 0.9), 
            vjust = -0.25) +
  scale_fill_manual(values = cell_line_colors) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5)) +  # Adjust the title position
  ggtitle("Filtered cells per sample") +
  xlab("Cell lines") +  # Adjust x-axis label
  ylab("Frequency")    # Adjust y-axis label

print(ncells)

# TEST-1
# given that the output of RunPCA is "pca"
# replace "so" by the name of your seurat object

pct <- L5[["pca"]]@stdev / sum(L5[["pca"]]@stdev) * 100
cumu <- cumsum(pct) # Calculate cumulative percents for each PC
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which((pct[-length(pct)] - pct[-1]) > 0.1), decreasing = T)[1] + 1
# last point where change of % of variation is more than 0.1%. -> co2
co2
## [1] 11
# TEST-2
# get significant PCs
stdv <- L5[["pca"]]@stdev
sum.stdv <- sum(L5[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
cumulative <- cumsum(percent.stdv)
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co2 <- sort(which((percent.stdv[1:length(percent.stdv) - 1] - 
                       percent.stdv[2:length(percent.stdv)]) > 0.1), 
              decreasing = T)[1] + 1
min.pc <- min(co1, co2)
min.pc
## [1] 11
# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv, 
           cumu = cumulative, 
           rank = 1:length(percent.stdv))

# Elbow plot to visualize 
  ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) + 
  geom_text() + 
  geom_vline(xintercept = 90, color = "grey") + 
  geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
  theme_bw()

6. Clustering

L5 <- FindNeighbors(L5, 
                    dims = 1:min.pc, 
                    verbose = FALSE)

# understanding resolution
L5 <- FindClusters(L5, 
                  resolution = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 
                                0.7,0.8, 0.9, 1, 1.1, 1.2))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6022
## Number of edges: 198822
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9363
## Number of communities: 3
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6022
## Number of edges: 198822
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9001
## Number of communities: 5
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6022
## Number of edges: 198822
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8723
## Number of communities: 6
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6022
## Number of edges: 198822
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8475
## Number of communities: 7
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6022
## Number of edges: 198822
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8280
## Number of communities: 7
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6022
## Number of edges: 198822
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8088
## Number of communities: 8
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6022
## Number of edges: 198822
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7946
## Number of communities: 9
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6022
## Number of edges: 198822
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7812
## Number of communities: 9
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6022
## Number of edges: 198822
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7687
## Number of communities: 9
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6022
## Number of edges: 198822
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7574
## Number of communities: 10
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6022
## Number of edges: 198822
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7450
## Number of communities: 12
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6022
## Number of edges: 198822
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7358
## Number of communities: 11
## Elapsed time: 0 seconds
# non-linear dimensionality reduction --------------
L5 <- RunUMAP(L5, 
              dims = 1:min.pc,
              verbose = FALSE)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
# note that you can set `label = TRUE` or use the Label Clusters function to help label
# individual clusters
DimPlot(L5,group.by = "cell_line", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(L5,
        group.by = "SCT_snn_res.0.1", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(L5,
        group.by = "SCT_snn_res.0.2", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(L5,
        group.by = "SCT_snn_res.0.3", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(L5,
        group.by = "SCT_snn_res.0.4", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(L5,
        group.by = "SCT_snn_res.0.5", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(L5,
        group.by = "SCT_snn_res.0.6", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(L5,
        group.by = "SCT_snn_res.0.7", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(L5,
        group.by = "SCT_snn_res.0.8", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(L5,
        group.by = "SCT_snn_res.0.9", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(L5,
        group.by = "SCT_snn_res.1", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(L5,
        group.by = "SCT_snn_res.1.1", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(L5,
        group.by = "SCT_snn_res.1.2", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

7. Azimuth Annotation

InstallData("pbmcref")
## Warning: The following packages are already installed and will not be
## reinstalled: pbmcref
# The RunAzimuth function can take a Seurat object as input
L5 <- RunAzimuth(L5, reference = "pbmcref")
## Warning: Overwriting miscellanous data for model
## Warning: Adding a dimensional reduction (refUMAP) without the associated assay
## being present
## Warning: Adding a dimensional reduction (refUMAP) without the associated assay
## being present
## detected inputs from HUMAN with id type Gene.name
## reference rownames detected HUMAN with id type Gene.name
## Normalizing query using reference SCT model
## Warning: 113 features of the features specified were not present in both the reference query assays. 
## Continuing with remaining 4887 features.
## Projecting cell embeddings
## Finding query neighbors
## Finding neighborhoods
## Finding anchors
##  Found 204 anchors
## Finding integration vectors
## Finding integration vector weights
## Predicting cell labels
## Predicting cell labels
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## Predicting cell labels
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## 
## Integrating dataset 2 with reference dataset
## Finding integration vectors
## Integrating data
## Warning: Keys should be one or more alphanumeric characters followed by an
## underscore, setting key from integrated_dr_ to integrateddr_
## Computing nearest neighbors
## Running UMAP projection
## Warning in RunUMAP.default(object = neighborlist, reduction.model =
## reduction.model, : Number of neighbors between query and reference is not equal
## to the number of neighbors within reference
## 16:07:42 Read 6022 rows
## 16:07:42 Processing block 1 of 1
## 16:07:42 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 20
## 16:07:42 Initializing by weighted average of neighbor coordinates using 1 thread
## 16:07:42 Commencing optimization for 67 epochs, with 120440 positive edges
## 16:07:42 Finished
## Warning: No assay specified, setting assay as RNA by default.
## Projecting reference PCA onto query
## Finding integration vector weights
## Projecting back the query cells into original PCA space
## Finding integration vector weights
## Computing scores:
##     Finding neighbors of original query cells
##     Finding neighbors of transformed query cells
##     Computing query SNN
##     Determining bandwidth and computing transition probabilities
## Total elapsed time: 2.09838366508484
DimPlot(L5, group.by = "predicted.celltype.l2", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = F)

DimPlot (L5, group.by = "predicted.celltype.l2", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T)

8. Cell type annotation using ProjectTils

#Load reference atlas and query data
ref <- readRDS(file = "CD4T_human_ref_v1.rds")

#Run Projection algorithm
query.projected <- Run.ProjecTILs(L5, ref = ref)
##   |                                                                              |                                                                      |   0%[1] "Using assay SCT for query"
## Pre-filtering cells with scGate...
## 
## ### Detected a total of 456 pure 'Target' cells (7.57% of total)
## [1] "5566 out of 6022 ( 92% ) non-pure cells removed. Use filter.cells=FALSE to avoid pre-filtering"
## [1] "Aligning query to reference map for batch-correction..."
## Warning: Layer counts isn't present in the assay object[[assay]]; returning
## NULL
## Preparing PCA embeddings for objects...
## Warning: Number of dimensions changing from 50 to 20
## 
## Projecting corrected query onto Reference PCA space
## 
## Projecting corrected query onto Reference UMAP space
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
## Warning: Not all features provided are in this Assay object, removing the
## following feature(s): GZMK, CCL20, CD177, CCL22, CGA, G0S2, IL17A, CCL4L2,
## XCL1, IGFL2, GZMH, TRDC, CSF2, FOXP3, XCL2, IL10, IL1RL1, IL1R2, ACTG2, KLRB1,
## DIRAS3, NPPC, IL17RB, ZBED2, CD7, CD200, CXCR6, HOPX, MS4A6A, LAYN, TYROBP,
## CTSL, TUBA3D, IL2, CRTAM, FCER1G, CST7, PVALB, EOMES, MRC1, TASL, EGR2, CPM,
## TMIGD2, H1-4, METTL7A, ZNF80, IL26, LRRC32, H2AZ1, ACP5, GPR25, TNFSF8, TNS3,
## ELAPOR1, CCL3L3, POLR1F, PDCD1, CXCR5, EGR3, FCRL3, ADTRP, F5, IL1R1, PECAM1,
## AHSP, FAIM2, GIMAP4, HTRA1, CCND1, LIMS2, H1-2, CYSLTR1, H1-0, FLT1, WARS1,
## MATK, H1-3, CAMK1, ASCL2, SIRPG, DTHD1, H2BC11, NELL2, GPX1, STAC, H2AC6,
## IRAG2, CYP7B1, H1-10, MYO7A, FASLG, VAV3, SCML1, CLEC7A, PON2, H3C10, FBLN7,
## FGFBP2, IL22, SLC28A3, PDLIM4, ZNF683, ECEL1, CPE, ARC, NLRP3, H4C3, RORC,
## AUTS2, HS3ST1
##   |                                                                              |======================================================================| 100%
## Creating slots functional.cluster and functional.cluster.conf in query object
#reference atlas
DimPlot(ref, label = T)

#Visualize projection
plot.projection(ref, query.projected, linesize = 0.5, pointsize = 0.5)

#Plot the predicted composition of the query in terms of reference T cell subtypes
plot.statepred.composition(ref, query.projected, metric = "Percent")

L5 <- ProjecTILs.classifier(query = L5, ref = ref)
##   |                                                                              |                                                                      |   0%[1] "Using assay SCT for query"
## Pre-filtering cells with scGate...
## 
## ### Detected a total of 456 pure 'Target' cells (7.57% of total)
## [1] "5566 out of 6022 ( 92% ) non-pure cells removed. Use filter.cells=FALSE to avoid pre-filtering"
## [1] "Aligning query to reference map for batch-correction..."
## Warning: Layer counts isn't present in the assay object[[assay]]; returning
## NULL
## Preparing PCA embeddings for objects...
## Warning: Number of dimensions changing from 50 to 20
## 
## Projecting corrected query onto Reference PCA space
## 
## Projecting corrected query onto Reference UMAP space
## Warning: Not all features provided are in this Assay object, removing the
## following feature(s): GZMK, CCL20, CD177, CCL22, CGA, G0S2, IL17A, CCL4L2,
## XCL1, IGFL2, GZMH, TRDC, CSF2, FOXP3, XCL2, IL10, IL1RL1, IL1R2, ACTG2, KLRB1,
## DIRAS3, NPPC, IL17RB, ZBED2, CD7, CD200, CXCR6, HOPX, MS4A6A, LAYN, TYROBP,
## CTSL, TUBA3D, IL2, CRTAM, FCER1G, CST7, PVALB, EOMES, MRC1, TASL, EGR2, CPM,
## TMIGD2, H1-4, METTL7A, ZNF80, IL26, LRRC32, H2AZ1, ACP5, GPR25, TNFSF8, TNS3,
## ELAPOR1, CCL3L3, POLR1F, PDCD1, CXCR5, EGR3, FCRL3, ADTRP, F5, IL1R1, PECAM1,
## AHSP, FAIM2, GIMAP4, HTRA1, CCND1, LIMS2, H1-2, CYSLTR1, H1-0, FLT1, WARS1,
## MATK, H1-3, CAMK1, ASCL2, SIRPG, DTHD1, H2BC11, NELL2, GPX1, STAC, H2AC6,
## IRAG2, CYP7B1, H1-10, MYO7A, FASLG, VAV3, SCML1, CLEC7A, PON2, H3C10, FBLN7,
## FGFBP2, IL22, SLC28A3, PDLIM4, ZNF683, ECEL1, CPE, ARC, NLRP3, H4C3, RORC,
## AUTS2, HS3ST1
##   |                                                                              |======================================================================| 100%
## Creating slots functional.cluster and functional.cluster.conf in query object
DimPlot(L5, group.by = "functional.cluster", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T)

9.Cell type annotation using SingleR

#get reference datasets from celldex package

monaco.ref <- celldex::MonacoImmuneData()
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
hpca.ref <- celldex::HumanPrimaryCellAtlasData()
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
dice.ref <- celldex::DatabaseImmuneCellExpressionData()
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
bpe.ref <- celldex::BlueprintEncodeData()
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
#convert our Seurat object to single cell experiment (SCE)
sce <- as.SingleCellExperiment(DietSeurat(L5))

#using SingleR
monaco.main <- SingleR(test = sce,assay.type.test = 1,ref = monaco.ref,labels = monaco.ref$label.main)
monaco.fine <- SingleR(test = sce,assay.type.test = 1,ref = monaco.ref,labels = monaco.ref$label.fine)
hpca.main <- SingleR(test = sce,assay.type.test = 1,ref = hpca.ref,labels = hpca.ref$label.main)
hpca.fine <- SingleR(test = sce,assay.type.test = 1,ref = hpca.ref,labels = hpca.ref$label.fine)
dice.main <- SingleR(test = sce,assay.type.test = 1,ref = dice.ref,labels = dice.ref$label.main)
dice.fine <- SingleR(test = sce,assay.type.test = 1,ref = dice.ref,labels = dice.ref$label.fine)
bpe.main <- SingleR(test = sce,assay.type.test = 1,ref = bpe.ref,labels = bpe.ref$label.main)
bpe.fine <- SingleR(test = sce,assay.type.test = 1,ref = bpe.ref,labels = bpe.ref$label.fine)


#summary of general cell type annotations

#table(monaco.main$pruned.labels)
#table(hpca.main$pruned.labels)
#table(dice.main$pruned.labels)
#table(bpe.main$pruned.labels)

#The finer cell types annotations are you after, the harder they are to get reliably. 
#This is where comparing many databases, as well as using individual markers from literature, 
#would all be very valuable.

#table(monaco.fine$pruned.labels)
#table(hpca.fine$pruned.labels)
#table(dice.fine$pruned.labels)
#table(bpe.fine$pruned.labels)



#add the annotations to the Seurat object metadata
L5@meta.data$monaco.main <- monaco.main$pruned.labels
L5@meta.data$monaco.fine <- monaco.fine$pruned.labels
#
L5@meta.data$hpca.main   <- hpca.main$pruned.labels
L5@meta.data$hpca.fine   <- hpca.fine$pruned.labels
#  
L5@meta.data$dice.main   <- dice.main$pruned.labels
L5@meta.data$dice.fine   <- dice.fine$pruned.labels
# 
L5@meta.data$bpe.main   <- bpe.main$pruned.labels
L5@meta.data$bpe.fine   <- bpe.fine$pruned.labels

# Monaco Annotations
DimPlot(L5, group.by = "monaco.main", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = F)

DimPlot(L5, group.by = "monaco.main", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(L5, group.by = "monaco.fine", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = F)

DimPlot(L5, group.by = "monaco.fine", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

# HPCA Annotations
DimPlot(L5, group.by = "hpca.main", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

DimPlot(L5, group.by = "hpca.fine", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = F)

DimPlot(L5, group.by = "hpca.fine", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# DICE Annotations
DimPlot(L5, group.by = "dice.main", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(L5, group.by = "dice.fine", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = F)

DimPlot(L5, group.by = "dice.fine", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

# BPE Annotations
DimPlot(L5, group.by = "bpe.main", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

DimPlot(L5, group.by = "bpe.fine", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = F)

DimPlot(L5, group.by = "bpe.fine", 
        reduction = "umap",
        label.size = 3,
        repel = T,
        label = T, label.box = T)

10. clusTree

library(clustree)
## Loading required package: ggraph
## 
## Attaching package: 'ggraph'
## The following object is masked from 'package:sp':
## 
##     geometry
clustree(L5, prefix = "SCT_snn_res.")

11.Save the Seurat object as an Robj file

save(L5, file = "L5_Analysis.Robj")

```