L1_Merged_first_HPC_PC-1:50-5
1. load libraries
2. Load Seurat Object
# Load the Seurat object merged from the cell lines and a control (PBMC)
# after filtration.
# load() restores objects under the names they were saved with and returns
# those names, so check that `filtered_data` really came from this file instead
# of failing later with "object not found" (or silently reusing a stale
# `filtered_data` left over in the session).
loaded_objects <- load("All_T_cells_Merged_filtered_Mono_using_clusters.Robj")
if (!"filtered_data" %in% loaded_objects) {
  stop(
    "Expected an object named 'filtered_data' in the .Robj file; found: ",
    paste(loaded_objects, collapse = ", "),
    call. = FALSE
  )
}
All_samples_Merged <- filtered_data
# Print the object summary.
All_samples_Merged
## An object of class Seurat
## 62626 features across 46976 samples within 6 assays
## Active assay: SCT (25902 features, 3000 variable features)
## 3 layers present: counts, data, scale.data
## 5 other assays present: RNA, ADT, prediction.score.celltype.l1, prediction.score.celltype.l2, prediction.score.celltype.l3
## 4 dimensional reductions calculated: pca, umap, integrated_dr, ref.umap
3. QC
# Use the cell line as the active identity so the QC plots are split by it.
Idents(object = All_samples_Merged) <- "cell_line"

# Percentage of counts coming from features whose names start with RPS/RPL.
# NOTE(review): no assay is given, so this is computed on the object's default
# assay — confirm that is the intended one.
All_samples_Merged[["percent.rb"]] <- PercentageFeatureSet(
  All_samples_Merged,
  pattern = "^RP[SL]"
)

# Violin plots of the per-cell QC metrics. The mitochondrial column is
# "percent.mito", the name used by the FeatureScatter and SCTransform steps of
# this script, rather than "percent.mt".
VlnPlot(
  All_samples_Merged,
  features = c("nFeature_RNA", "nCount_RNA", "percent.mito", "percent.rb"),
  ncol = 4,
  pt.size = 0.1
) &
  theme(plot.title = element_text(size = 10))
# Scatter of nCount_RNA against nFeature_RNA with a linear-model fit overlaid.
FeatureScatter(
  object = All_samples_Merged,
  feature1 = "nCount_RNA",
  feature2 = "nFeature_RNA"
) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
## FeatureScatter is typically used to visualize feature-feature relationships for anything calculated by the object, i.e. columns in object metadata, PC scores, etc.
# Scatter of nCount_RNA against percent.mito with a linear-model fit overlaid.
FeatureScatter(
  object = All_samples_Merged,
  feature1 = "nCount_RNA",
  feature2 = "percent.mito"
) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
# nCount_RNA vs nFeature_RNA again (same call as the first scatter plot in
# this QC section), with a linear-model fit overlaid.
FeatureScatter(
  object = All_samples_Merged,
  feature1 = "nCount_RNA",
  feature2 = "nFeature_RNA"
) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
Assign Cell-Cycle Scores
## Warning: The following features are not present in the object: MLF1IP, not
## searching for symbol synonyms
## Warning: The following features are not present in the object: FAM64A, HN1, not
## searching for symbol synonyms
4. Normalize data
# Normalize with SCTransform, regressing out the ribosomal and mitochondrial
# percentages and the CC.Difference covariate.
# NOTE(review): CC.Difference is not created in the code visible here;
# presumably it comes from the cell-cycle scoring step — confirm.
All_samples_Merged <- SCTransform(
  object = All_samples_Merged,
  vars.to.regress = c("percent.rb", "percent.mito", "CC.Difference"),
  verbose = TRUE
)
## Running SCTransform on assay: RNA
## vst.flavor='v2' set. Using model with fixed slope and excluding poisson genes.
## Calculating cell attributes from input UMI matrix: log_umi
## Variance stabilizing transformation of count matrix of size 25901 by 46976
## Model formula is y ~ log_umi
## Get Negative Binomial regression parameters per gene
## Using 2000 genes, 5000 cells
## Found 484 outliers - those will be ignored in fitting/regularization step
## Second step: Get residuals using fitted parameters for 25901 genes
## Computing corrected count matrix for 25901 genes
## Calculating gene attributes
## Wall clock passed: Time difference of 2.289435 mins
## Determine variable features
## Regressing out percent.rb, percent.mito, CC.Difference
## Centering data matrix
## Place corrected count matrix in counts slot
## Warning: Different cells and/or features from existing assay SCT
## Set default assay to SCT
5. Perform PCA
# Variable features of the SCT assay, read through the accessor rather than by
# reaching into the assay's slots.
Variables_genes <- VariableFeatures(All_samples_Merged, assay = "SCT")

# Drop genes whose symbols start with "HLA-", "XIST", "TRBV" or "TRAV" so they
# do not drive the principal components.
Variables_genes_after_exclusion <- Variables_genes[
  !grepl("^HLA-|^XIST|^TRBV|^TRAV", Variables_genes)
]

# PCA on the remaining variable features.
# The printing options use the current RunPCA argument names: the older
# do.print / pcs.print / genes.print names are not recognised any more and were
# silently ignored (the default of 30 genes per direction was printed instead
# of the requested 15).
All_samples_Merged <- RunPCA(
  All_samples_Merged,
  features = Variables_genes_after_exclusion,
  npcs = 50,
  verbose = TRUE,
  ndims.print = 1:5,
  nfeatures.print = 15
)
## PC_ 1
## Positive: CCL17, TNFRSF4, CA2, SYT4, MIR155HG, SEC11C, EGFL6, C12orf75, IL2RA, CA10
## CCL5, IGHE, KRT7, PRG4, LTA, STC1, TIGIT, CD74, EEF1A2, ALOX5AP
## THY1, CFI, HDGFL3, MIIP, RANBP17, RXFP1, PHLDA2, ONECUT2, BACE2, HACD1
## Negative: CD7, XCL1, KIR3DL1, MALAT1, XCL2, LTB, KIR2DL3, CST7, CD52, RPS4Y1
## MT1G, KLRC1, IL7R, KIR2DL4, ESYT2, GIMAP7, IFITM1, TMSB4X, IFITM2, ID3
## SH3BGRL3, CXCR3, KRT81, GZMM, KIR3DL2, KRT86, MYO1E, CLEC2B, KLF2, KLRK1
## PC_ 2
## Positive: CCL17, XCL1, CD7, KIR3DL1, XCL2, LTB, CST7, MT1G, KLRC1, KIR2DL4
## CA2, KIR2DL3, TNFRSF4, PLPP1, SPINT2, KRT81, CYBA, MATK, GZMM, KRT86
## ESYT2, HIST1H1B, MYO1E, EPCAM, SYT4, TRGV2, CORO1B, HIST1H4C, CXCR3, NKG7
## Negative: PPBP, CD74, MT2A, PAGE5, CD70, LMNA, TENM3, RPL22L1, LGALS3, STAT1
## RBPMS, CCDC50, B2M, FABP5, IQCG, GSTP1, PPP2R2B, ANXA1, MACROD2, SPOCK1
## CTAG2, PIM2, FTL, SLC7A11-AS1, BASP1, GAPDH, LGALS1, VIM, TNFSF10, AHNAK
## PC_ 3
## Positive: RPS4Y1, MALAT1, IL7R, BTG1, PNRC1, CCL17, LINC00861, TCF7, GIMAP7, SELL
## SARAF, B2M, GIMAP5, PIK3IP1, ZFP36, FTH1, KLF2, TRBC2, CCR7, SESN3
## YPEL3, PCED1B-AS1, CCL5, TRBC1, GIMAP4, PABPC1, RGCC, ZFP36L2, FYB1, ITM2B
## Negative: PPBP, XCL1, KRT1, GAPDH, CD74, ACTB, KIR3DL1, XCL2, FABP5, MT2A
## RPL22L1, HIST1H4C, RPS2, TUBA1B, TUBB, C1QBP, KIR2DL3, TTC29, CST7, NME2
## GZMA, ACTG1, RPL13, NKG7, RPLP0, RPS15, FTL, RPS4X, RPLP1, PFN1
## PC_ 4
## Positive: CCL17, PPBP, MT2A, CD7, CA2, CCL5, LTA, XCL1, MIR155HG, CD74
## CA10, MGST3, STC1, XCL2, MALAT1, KIR2DL3, RXFP1, FCER2, RANBP17, CFI
## KIR3DL1, AL590550.1, IQCG, RYR2, IGHE, THY1, IL7R, STAT1, MT1G, KLRC1
## Negative: EEF1A2, TNFRSF4, IL2RA, WFDC1, PHLDA2, FN1, MIIP, S100A4, KRT1, HIST1H1C
## S100A11, PXYLP1, RDH10, S100A6, DUSP4, GPAT3, TIGIT, CDKN1A, LGALS1, HOXC9
## TNFRSF18, CORO1B, GATA3, AL136456.1, CEP135, EGLN3, HIST1H2BK, TP73, PTGDR2, TMEM163
## PC_ 5
## Positive: PPBP, RPS4Y1, FABP5, GSTP1, CD7, ENPP2, DNAJC12, AC068672.2, MGST1, IL7R
## CSMD1, LINC00861, SLC7A11-AS1, TCF7, FCER2, IL2RA, RDH10, CCDC50, EEF1A2, FAM162A
## HSP90B1, HSPD1, HSPE1, C1QBP, MIIP, SELL, EIF5A, PPID, SPINK6, FTH1
## Negative: S100A4, MT2A, GZMA, LGALS3, CD74, KRT1, CCL17, S100A6, GZMB, NKG7
## CCL1, IL32, CSF2, SERPINE1, TNFSF10, NCR3, CCL4, TSC22D3, TTC29, VIM
## PTGIS, MAL, SH3BGRL3, AC114977.1, CD52, RYR2, S100A11, CYP1B1, LMNA, PLD1
Perform PCA TEST
library(ggplot2)
library(RColorBrewer)

# Number of cells per cell line.
# NOTE: the count column keeps its original name "nUMI", but it holds the cell
# counts returned by table(), not UMI counts.
data <- as.data.frame(table(All_samples_Merged$cell_line))
colnames(data) <- c("cell_line", "nUMI")

# One fill colour per cell line, sized from the data instead of assuming
# exactly 10 cell lines. "Set3" provides between 3 and 12 colours; beyond 12
# the palette is interpolated so scale_fill_manual() never runs out of values.
n_cell_lines <- nrow(data)
set3_max <- 12L
cell_line_colors <- if (n_cell_lines <= set3_max) {
  brewer.pal(max(3L, n_cell_lines), "Set3")[seq_len(n_cell_lines)]
} else {
  colorRampPalette(brewer.pal(set3_max, "Set3"))(n_cell_lines)
}

# Bar chart of the counts, with the count printed above each bar.
ncells <- ggplot(data, aes(x = cell_line, y = nUMI, fill = cell_line)) +
  geom_col() +
  theme_classic() +
  geom_text(
    aes(label = nUMI),
    position = position_dodge(width = 0.9),
    vjust = -0.25
  ) +
  scale_fill_manual(values = cell_line_colors) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5) # centre the title
  ) +
  ggtitle("Filtered cells per sample") +
  xlab("Cell lines") +
  ylab("Frequency")
print(ncells)
# TEST-1
# Works on the "pca" reduction produced by RunPCA.
# Each PC's standard deviation as a percentage of the summed standard
# deviations of all PCs.
pct <- All_samples_Merged[["pca"]]@stdev
pct <- pct / sum(pct) * 100
# Running total of those percentages.
cumu <- cumsum(pct)
# -diff(pct)[i] is pct[i] - pct[i + 1], the drop from one PC to the next.
# co2 is one past the last position where that drop still exceeds 0.1.
co2 <- sort(which(-diff(pct) > 0.1), decreasing = TRUE)[1] + 1
co2
## [1] 21
# TEST-2
# Pick the number of PCs to keep from the PCA standard deviations.
stdv <- All_samples_Merged[["pca"]]@stdev
sum.stdv <- sum(stdv)
# Each PC's share of the summed standard deviations, in percent, and its
# running total.
percent.stdv <- (stdv / sum.stdv) * 100
cumulative <- cumsum(percent.stdv)
# co1: first PC at which the running total exceeds 90 while the PC's own share
# is below 5.
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
# co2: one past the last PC whose drop to the next PC exceeds 0.1.
# Negative indexing pairs each PC with its successor; the former
# `1:length(x) - 1` parsed as `(1:n) - 1` (i.e. 0:(n - 1)) and only gave the
# right elements because index 0 is silently dropped.
co2 <- sort(
  which(
    (percent.stdv[-length(percent.stdv)] - percent.stdv[-1]) > 0.1
  ),
  decreasing = TRUE
)[1] + 1
# Use the smaller of the two cut-offs.
min.pc <- min(co1, co2)
min.pc
## [1] 21
# Collect the per-PC percentage, the running total and the PC rank.
plot_df <- data.frame(
  pct = percent.stdv,
  cumu = cumulative,
  rank = seq_along(percent.stdv)
)
# Elbow-style plot: each PC is drawn as its rank number, coloured by whether
# it lies beyond min.pc. The aesthetics now map the columns of plot_df rather
# than same-valued global vectors, so the plot depends only on the data frame
# it is given; labs() keeps the axis titles the plot had before.
ggplot(plot_df, aes(x = cumu, y = pct, label = rank, color = rank > min.pc)) +
  geom_text() +
  # Reference line at a running total of 90.
  geom_vline(xintercept = 90, color = "grey") +
  # Reference line at the smallest per-PC percentage that is still above 5.
  geom_hline(yintercept = min(plot_df$pct[plot_df$pct > 5]), color = "grey") +
  labs(x = "cumulative", y = "percent.stdv") +
  theme_bw()
6. Clustering
# Neighbour graph on PCs 1 to 50.
All_samples_Merged <- FindNeighbors(
  object = All_samples_Merged,
  dims = 1:50,
  verbose = FALSE
)

# Cluster at several resolutions to see how the number of clusters changes
# with the resolution parameter.
All_samples_Merged <- FindClusters(
  object = All_samples_Merged,
  resolution = c(0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1)
)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 46976
## Number of edges: 1681420
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9549
## Number of communities: 17
## Elapsed time: 7 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 46976
## Number of edges: 1681420
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9465
## Number of communities: 20
## Elapsed time: 8 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 46976
## Number of edges: 1681420
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9392
## Number of communities: 23
## Elapsed time: 8 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 46976
## Number of edges: 1681420
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9322
## Number of communities: 24
## Elapsed time: 8 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 46976
## Number of edges: 1681420
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9255
## Number of communities: 27
## Elapsed time: 8 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 46976
## Number of edges: 1681420
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9189
## Number of communities: 29
## Elapsed time: 8 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 46976
## Number of edges: 1681420
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9135
## Number of communities: 30
## Elapsed time: 8 seconds
# Non-linear dimensionality reduction ----
# UMAP on the same 50 PCs that were used for the neighbour graph.
All_samples_Merged <- RunUMAP(
  object = All_samples_Merged,
  dims = 1:50,
  verbose = FALSE
)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
# `label = TRUE` (or the LabelClusters function) adds a text label for each
# group on the plot.

# UMAP coloured by cell line.
DimPlot(
  object = All_samples_Merged,
  reduction = "umap",
  group.by = "cell_line",
  label = TRUE,
  label.size = 3,
  repel = TRUE
)

# UMAP coloured by the clusters found at resolution 0.7.
DimPlot(
  object = All_samples_Merged,
  reduction = "umap",
  group.by = "SCT_snn_res.0.7",
  label = TRUE,
  label.size = 3,
  repel = TRUE
)
# Cells per active identity class, drawn as a base-graphics bar chart.
# NOTE(review): the table printed after this chunk has 30 classes, which
# matches the resolution = 1 clustering run rather than the 0.7 resolution
# shown in the UMAP — confirm this is the intended clustering.
cluster_table <- table(Idents(All_samples_Merged))
barplot(
  height = cluster_table,
  main = "Number of Cells in Each Cluster",
  xlab = "Cluster",
  ylab = "Number of Cells",
  col = rainbow(length(cluster_table))
)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 4849 4152 3897 3695 3370 3214 2900 2561 2421 2177 2004 1842 1716 1223 827 792
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29
## 650 636 528 518 517 510 499 346 278 245 207 197 112 93
7. Azimuth Annotation
## Warning: The following packages are already installed and will not be
## reinstalled: pbmcref
# RunAzimuth accepts the Seurat object directly as the query; annotate it
# against the "pbmcref" reference.
All_samples_Merged <- RunAzimuth(
  All_samples_Merged,
  reference = "pbmcref"
)
## Warning: Overwriting miscellanous data for model
## Warning: Adding a dimensional reduction (refUMAP) without the associated assay
## being present
## Warning: Adding a dimensional reduction (refUMAP) without the associated assay
## being present
## detected inputs from HUMAN with id type Gene.name
## reference rownames detected HUMAN with id type Gene.name
## Normalizing query using reference SCT model
## Warning: 113 features of the features specified were not present in both the reference query assays.
## Continuing with remaining 4887 features.
## Projecting cell embeddings
## Finding query neighbors
## Finding neighborhoods
## Finding anchors
## Found 4803 anchors
## Finding integration vectors
## Finding integration vector weights
## Predicting cell labels
## Predicting cell labels
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## Predicting cell labels
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
##
## Integrating dataset 2 with reference dataset
## Finding integration vectors
## Integrating data
## Warning: Keys should be one or more alphanumeric characters followed by an
## underscore, setting key from integrated_dr_ to integrateddr_
## Computing nearest neighbors
## Running UMAP projection
## Warning in RunUMAP.default(object = neighborlist, reduction.model =
## reduction.model, : Number of neighbors between query and reference is not equal
## to the number of neighbors within reference
## 12:27:05 Read 46976 rows
## 12:27:05 Processing block 1 of 1
## 12:27:05 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 20
## 12:27:06 Initializing by weighted average of neighbor coordinates using 1 thread
## 12:27:06 Commencing optimization for 67 epochs, with 939520 positive edges
## 12:27:10 Finished
## Warning: No assay specified, setting assay as RNA by default.
## Projecting reference PCA onto query
## Finding integration vector weights
## Projecting back the query cells into original PCA space
## Finding integration vector weights
## Computing scores:
## Finding neighbors of original query cells
## Finding neighbors of transformed query cells
## Computing query SNN
## Determining bandwidth and computing transition probabilities
## Total elapsed time: 21.6214122772217
# UMAP coloured by the predicted level-2 cell type, without labels.
DimPlot(
  object = All_samples_Merged,
  reduction = "umap",
  group.by = "predicted.celltype.l2",
  label = FALSE,
  label.size = 3,
  repel = TRUE
)

# Same plot with a repelled text label per cell type.
DimPlot(
  object = All_samples_Merged,
  reduction = "umap",
  group.by = "predicted.celltype.l2",
  label = TRUE,
  label.size = 3,
  repel = TRUE
)
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
8. Cell type annotation using ProjecTILs
# Load the reference atlas from disk.
ref <- readRDS(
  file = "../../8-Cell_Lines_Test/CD4T_human_ref_v1.rds"
)

# Project the query object onto the reference; the result is kept separately
# in `query.projected`.
query.projected <- Run.ProjecTILs(
  All_samples_Merged,
  ref = ref
)
## | | | 0%[1] "Using assay SCT for query"
## Pre-filtering cells with scGate...
##
## ### Detected a total of 26819 pure 'Target' cells (57.09% of total)
## [1] "20157 out of 46976 ( 43% ) non-pure cells removed. Use filter.cells=FALSE to avoid pre-filtering"
## [1] "Aligning query to reference map for batch-correction..."
## Warning: Layer counts isn't present in the assay object[[assay]]; returning
## NULL
## Preparing PCA embeddings for objects...
## Warning: Number of dimensions changing from 50 to 20
##
## Projecting corrected query onto Reference PCA space
##
## Projecting corrected query onto Reference UMAP space
## Warning: Not all features provided are in this Assay object, removing the
## following feature(s): CD177, TASL, H1-4, H2AZ1, ELAPOR1, CCL3L3, POLR1F, AHSP,
## H1-2, H1-0, WARS1, H1-3, H2BC11, GPX1, H2AC6, IRAG2, H1-10, H3C10, IL22, ECEL1,
## H4C3
## | |======================================================================| 100%
## Creating slots functional.cluster and functional.cluster.conf in query object
# Composition of the projected query across the reference T cell subtypes,
# shown as percentages.
plot.statepred.composition(
  ref,
  query.projected,
  metric = "Percent"
)
## | | | 0%[1] "Using assay SCT for query"
## Pre-filtering cells with scGate...
##
## ### Detected a total of 26819 pure 'Target' cells (57.09% of total)
## [1] "20157 out of 46976 ( 43% ) non-pure cells removed. Use filter.cells=FALSE to avoid pre-filtering"
## [1] "Aligning query to reference map for batch-correction..."
## Warning: Layer counts isn't present in the assay object[[assay]]; returning
## NULL
## Preparing PCA embeddings for objects...
## Warning: Number of dimensions changing from 50 to 20
##
## Projecting corrected query onto Reference PCA space
##
## Projecting corrected query onto Reference UMAP space
## Warning: Not all features provided are in this Assay object, removing the
## following feature(s): CD177, TASL, H1-4, H2AZ1, ELAPOR1, CCL3L3, POLR1F, AHSP,
## H1-2, H1-0, WARS1, H1-3, H2BC11, GPX1, H2AC6, IRAG2, H1-10, H3C10, IL22, ECEL1,
## H4C3
## | |======================================================================| 100%
## Creating slots functional.cluster and functional.cluster.conf in query object
# UMAP coloured by the "functional.cluster" labels.
# NOTE(review): this plots All_samples_Merged, not query.projected; confirm
# that a "functional.cluster" column has been added to All_samples_Merged
# before this call.
DimPlot(
  object = All_samples_Merged,
  reduction = "umap",
  group.by = "functional.cluster",
  label = TRUE,
  label.size = 3,
  repel = TRUE
)
clusTree
## Loading required package: ggraph
##
## Attaching package: 'ggraph'
## The following object is masked from 'package:sp':
##
## geometry
Azimuth Visualization
# Level-1 predicted cell types, with boxed labels.
DimPlot(
  object = All_samples_Merged,
  reduction = "umap",
  group.by = "predicted.celltype.l1",
  label = TRUE,
  label.box = TRUE,
  label.size = 3,
  repel = TRUE
)

# Level-1 predicted cell types, without labels.
DimPlot(
  object = All_samples_Merged,
  reduction = "umap",
  group.by = "predicted.celltype.l1",
  label = FALSE,
  label.size = 3,
  repel = TRUE
)

# Level-2 predicted cell types, with boxed labels.
DimPlot(
  object = All_samples_Merged,
  reduction = "umap",
  group.by = "predicted.celltype.l2",
  label = TRUE,
  label.box = TRUE,
  label.size = 3,
  repel = TRUE
)
## Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Level-2 predicted cell types, without labels.
DimPlot(
  object = All_samples_Merged,
  reduction = "umap",
  group.by = "predicted.celltype.l2",
  label = FALSE,
  label.size = 3,
  repel = TRUE
)

# Level-2 predicted cell types, with boxed labels (same call as the labelled
# level-2 plot earlier in this section).
DimPlot(
  object = All_samples_Merged,
  reduction = "umap",
  group.by = "predicted.celltype.l2",
  label = TRUE,
  label.box = TRUE,
  label.size = 3,
  repel = TRUE
)
## Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## 0 1 2 3 4 5 6 7 8 9 10 11
## ASDC 45 0 0 0 0 26 6 0 1 8 0 0
## B intermediate 3 0 0 1 2 0 0 1 0 0 0 0
## B memory 0 0 2 0 0 0 0 0 0 0 1 0
## CD4 CTL 0 0 0 0 0 0 0 0 0 0 0 6
## CD4 Naive 0 0 0 741 0 0 0 0 0 0 0 1
## CD4 Proliferating 5217 2895 2416 0 3669 3843 2885 2921 1342 1019 1198 0
## CD4 TCM 442 287 2814 4205 62 325 65 5 7 171 6 45
## CD4 TEM 0 0 11 17 0 0 0 0 0 0 0 31
## CD8 Naive 1 0 0 49 1 146 14 0 1 41 0 325
## CD8 Proliferating 0 0 0 0 0 0 0 0 0 0 0 0
## CD8 TCM 0 14 434 33 0 0 0 0 0 0 0 219
## CD8 TEM 0 1 0 1 0 0 0 0 0 0 0 206
## cDC2 456 0 1 0 827 36 201 84 95 435 7 0
## dnT 0 0 3 26 0 0 0 0 0 0 0 3
## gdT 0 0 0 0 0 0 0 0 0 0 0 13
## HSPC 40 0 1 3 180 0 646 424 395 0 1 0
## ILC 0 0 0 1 0 0 0 0 0 0 0 0
## MAIT 0 0 0 12 0 0 0 0 0 0 0 46
## NK 0 0 0 0 0 0 0 0 0 0 0 91
## NK Proliferating 4 2725 17 0 207 38 11 8 1 21 5 0
## Platelet 0 0 0 1 0 0 0 0 0 0 0 0
## Treg 0 0 1 159 0 0 0 0 0 0 0 0
##
## 12 13 14 15 16
## ASDC 0 0 0 3 0
## B intermediate 5 35 1 1 1
## B memory 0 1 0 0 0
## CD4 CTL 0 0 0 0 0
## CD4 Naive 0 5 0 0 0
## CD4 Proliferating 355 124 230 152 87
## CD4 TCM 3 126 1 7 79
## CD4 TEM 0 0 0 0 0
## CD8 Naive 0 3 1 1 0
## CD8 Proliferating 0 1 0 0 0
## CD8 TCM 0 1 0 0 17
## CD8 TEM 0 0 0 0 0
## cDC2 75 9 7 36 13
## dnT 0 22 0 0 0
## gdT 0 0 0 0 0
## HSPC 46 4 0 2 0
## ILC 0 0 0 0 0
## MAIT 0 0 0 0 0
## NK 0 1 0 0 0
## NK Proliferating 10 22 0 1 0
## Platelet 0 1 0 0 0
## Treg 0 33 0 1 0