Loading required package: SeuratObject
Loading required package: sp
Attaching package: 'SeuratObject'
The following objects are masked from 'package:base':
intersect, t
── Installed datasets ──────────────────────────────── SeuratData v0.2.2.9001 ──
✔ pbmcref 1.0.0 ✔ pbmcsca 3.0.0
────────────────────────────────────── Key ─────────────────────────────────────
✔ Dataset loaded successfully
❯ Dataset built with a newer version of Seurat than installed
❓ Unknown version of Seurat installed
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats 1.0.0 ✔ readr 2.1.5
✔ ggplot2 3.5.1 ✔ stringr 1.5.1
✔ lubridate 1.9.3 ✔ tibble 3.2.1
✔ purrr 1.0.2 ✔ tidyr 1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Attaching package: 'magrittr'
The following object is masked from 'package:purrr':
set_names
The following object is masked from 'package:tidyr':
extract
Attaching package: 'dbplyr'
The following objects are masked from 'package:dplyr':
ident, sql
Registered S3 method overwritten by 'SeuratDisk':
method from
as.sparse.H5Group Seurat
Attaching shinyBS
Loading required package: ggraph
Attaching package: 'ggraph'
The following object is masked from 'package:sp':
geometry
#Load Seurat Object merged from cell lines and a control(PBMC) after filtration
load("../../../0-IMP-OBJECTS/All_Samples_Merged_with_10x_Azitmuth_Annotated.robj")
All_samples_Merged
An object of class Seurat
36752 features across 59355 samples within 5 assays
Active assay: RNA (36601 features, 0 variable features)
2 layers present: data, counts
4 other assays present: ADT, prediction.score.celltype.l1, prediction.score.celltype.l2, prediction.score.celltype.l3
2 dimensional reductions calculated: integrated_dr, ref.umap
# Load necessary libraries
library(Seurat)
# Display basic metadata summary
head(All_samples_Merged@meta.data)
# Check if columns such as `orig.ident`, `nCount_RNA`, `nFeature_RNA`, `nUMI`, `ngene`, and any other necessary columns exist
required_columns <- c("orig.ident", "nCount_RNA", "nFeature_RNA", "nUMI", "ngene")
missing_columns <- setdiff(required_columns, colnames(All_samples_Merged@meta.data))
if (length(missing_columns) > 0) {
cat("Missing columns:", paste(missing_columns, collapse = ", "), "\n")
} else {
cat("All required columns are present.\n")
}
All required columns are present.
# Check cell counts and features
cat("Number of cells:", ncol(All_samples_Merged), "\n")
Number of cells: 59355
cat("Number of features:", nrow(All_samples_Merged), "\n")
Number of features: 36601
# Verify that each `orig.ident` label has the correct number of cells
cat("Cell counts per group:\n")
Cell counts per group:
print(table(All_samples_Merged$orig.ident))
L1 L2 L3 L4 L5 L6 L7 PBMC PBMC10x
5825 5935 6428 6150 6022 5148 5331 8354 10162
# Check that the cell IDs are unique (which ensures no issues from merging)
if (any(duplicated(colnames(All_samples_Merged)))) {
cat("Warning: There are duplicated cell IDs.\n")
} else {
cat("Cell IDs are unique.\n")
}
Cell IDs are unique.
# Check the assay consistency for RNA
DefaultAssay(All_samples_Merged) <- "RNA"
# Check dimensions of the RNA counts layer using the new method
cat("Dimensions of the RNA counts layer:", dim(GetAssayData(All_samples_Merged, layer = "counts")), "\n")
Dimensions of the RNA counts layer: 36601 59355
cat("Dimensions of the RNA data layer:", dim(GetAssayData(All_samples_Merged, layer = "data")), "\n")
Dimensions of the RNA data layer: 36601 59355
# Check the ADT assay (optional)
if ("ADT" %in% names(All_samples_Merged@assays)) {
cat("ADT assay is present.\n")
cat("Dimensions of the ADT counts layer:", dim(GetAssayData(All_samples_Merged, assay = "ADT", layer = "counts")), "\n")
} else {
cat("ADT assay is not present.\n")
}
ADT assay is present.
Dimensions of the ADT counts layer: 56 59355
# InstallData("pbmcref")
#
# # The RunAzimuth function can take a Seurat object as input
# All_samples_Merged <- RunAzimuth(All_samples_Merged, reference = "pbmcref")
# Remove the percent.mito column
All_samples_Merged$percent.mito <- NULL
# Set identity classes to an existing column in meta data
Idents(object = All_samples_Merged) <- "cell_line"
All_samples_Merged[["percent.rb"]] <- PercentageFeatureSet(All_samples_Merged,
pattern = "^RP[SL]")
# Convert 'percent.mt' to numeric, replacing "NaN" with 0
All_samples_Merged$percent.rb <- replace(as.numeric(All_samples_Merged$percent.rb), is.na(All_samples_Merged$percent.rb), 0)
# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
All_samples_Merged[["percent.mt"]] <- PercentageFeatureSet(All_samples_Merged, pattern = "^MT-")
# Convert 'percent.mt' to numeric, replacing "NaN" with 0
All_samples_Merged$percent.mt <- replace(as.numeric(All_samples_Merged$percent.mt), is.na(All_samples_Merged$percent.mt), 0)
VlnPlot(All_samples_Merged, features = c("nFeature_RNA",
"nCount_RNA",
"percent.mt",
"percent.rb"),
ncol = 4, pt.size = 0.1) &
theme(plot.title = element_text(size=10))
FeatureScatter(All_samples_Merged, feature1 = "percent.mt",
feature2 = "percent.rb")
VlnPlot(All_samples_Merged, features = c("nFeature_RNA",
"nCount_RNA",
"percent.mt"),
ncol = 3)
FeatureScatter(All_samples_Merged,
feature1 = "percent.mt",
feature2 = "percent.rb") +
geom_smooth(method = 'lm')
`geom_smooth()` using formula = 'y ~ x'
FeatureScatter(All_samples_Merged,
feature1 = "nCount_RNA",
feature2 = "nFeature_RNA") +
geom_smooth(method = 'lm')
`geom_smooth()` using formula = 'y ~ x'
##FeatureScatter is typically used to visualize feature-feature relationships ##for anything calculated by the object, ##i.e. columns in object metadata, PC scores etc.
FeatureScatter(All_samples_Merged,
feature1 = "nCount_RNA",
feature2 = "percent.mt")+
geom_smooth(method = 'lm')
`geom_smooth()` using formula = 'y ~ x'
FeatureScatter(All_samples_Merged,
feature1 = "nCount_RNA",
feature2 = "nFeature_RNA")+
geom_smooth(method = 'lm')
`geom_smooth()` using formula = 'y ~ x'
Running SCTransform on assay: RNA
Running SCTransform on layer: counts
vst.flavor='v2' set. Using model with fixed slope and excluding poisson genes.
Variance stabilizing transformation of count matrix of size 27417 by 59355
Model formula is y ~ log_umi
Get Negative Binomial regression parameters per gene
Using 2000 genes, 5000 cells
Found 453 outliers - those will be ignored in fitting/regularization step
Second step: Get residuals using fitted parameters for 27417 genes
Computing corrected count matrix for 27417 genes
Calculating gene attributes
Wall clock passed: Time difference of 9.935109 mins
Determine variable features
Getting residuals for block 1(of 12) for counts dataset
Getting residuals for block 2(of 12) for counts dataset
Getting residuals for block 3(of 12) for counts dataset
Getting residuals for block 4(of 12) for counts dataset
Getting residuals for block 5(of 12) for counts dataset
Getting residuals for block 6(of 12) for counts dataset
Getting residuals for block 7(of 12) for counts dataset
Getting residuals for block 8(of 12) for counts dataset
Getting residuals for block 9(of 12) for counts dataset
Getting residuals for block 10(of 12) for counts dataset
Getting residuals for block 11(of 12) for counts dataset
Getting residuals for block 12(of 12) for counts dataset
Finished calculating residuals for counts
Set default assay to SCT
Warning: The following features are not present in the object: MLF1IP, not
searching for symbol synonyms
Warning: The following features are not present in the object: FAM64A, HN1, not
searching for symbol synonyms
# Apply SCTransform
All_samples_Merged <- SCTransform(All_samples_Merged,
vars.to.regress = c("percent.rb","percent.mt", "CC.Difference", "cell_line"),
do.scale=TRUE,
do.center=TRUE,
verbose = TRUE)
Running SCTransform on assay: RNA
Running SCTransform on layer: counts
vst.flavor='v2' set. Using model with fixed slope and excluding poisson genes.
Variance stabilizing transformation of count matrix of size 27417 by 59355
Model formula is y ~ log_umi
Get Negative Binomial regression parameters per gene
Using 2000 genes, 5000 cells
Found 453 outliers - those will be ignored in fitting/regularization step
Second step: Get residuals using fitted parameters for 27417 genes
Computing corrected count matrix for 27417 genes
Calculating gene attributes
Wall clock passed: Time difference of 8.604432 mins
Determine variable features
Regressing out percent.rb, percent.mt, CC.Difference, cell_line
Centering and scaling data matrix
Getting residuals for block 1(of 12) for counts dataset
Getting residuals for block 2(of 12) for counts dataset
Getting residuals for block 3(of 12) for counts dataset
Getting residuals for block 4(of 12) for counts dataset
Getting residuals for block 5(of 12) for counts dataset
Getting residuals for block 6(of 12) for counts dataset
Getting residuals for block 7(of 12) for counts dataset
Getting residuals for block 8(of 12) for counts dataset
Getting residuals for block 9(of 12) for counts dataset
Getting residuals for block 10(of 12) for counts dataset
Getting residuals for block 11(of 12) for counts dataset
Getting residuals for block 12(of 12) for counts dataset
Regressing out percent.rb, percent.mt, CC.Difference, cell_line
Centering and scaling data matrix
Finished calculating residuals for counts
Set default assay to SCT
Variables_genes <- All_samples_Merged@assays$SCT@var.features
# Exclude genes starting with "HLA-" AND "Xist" AND "TRBV, TRAV"
Variables_genes_after_exclusion <- Variables_genes[!grepl("^HLA-|^XIST|^TRBV|^TRAV", Variables_genes)]
# These are now standard steps in the Seurat workflow for visualization and clustering
All_samples_Merged <- RunPCA(All_samples_Merged,
features = Variables_genes_after_exclusion,
do.print = TRUE,
pcs.print = 1:5,
genes.print = 15,
npcs = 50)
PC_ 1
Positive: RPS27, CD247, MALAT1, EVL, ETS1, B2M, CD3E, TLE5, PRKCH, RPS29
LCK, BCL11B, IL2RG, CTSW, LEF1, NKG7, SKAP1, PRF1, LBH, CCND3
CAMK4, TC2N, TBC1D10C, CD3D, RHOH, RORA, TRBC2, LINC00861, TCF7, ABLIM1
Negative: S100A9, LYZ, S100A8, TYROBP, VCAN, FCN1, CST3, CTSS, SPI1, CYBB
IFI30, PLXDC2, ZEB2, AIF1, FOS, PSAP, HCK, MNDA, LRMDA, NCF2
RNF130, RBM47, SERPINA1, RAB31, CD36, CD14, LRP1, ARHGAP26, TBXAS1, CSF3R
PC_ 2
Positive: FCN1, MNDA, CST3, FOS, CSF3R, VCAN, CD36, MS4A6A, CD302, LYZ
PRAM1, CLEC12A, KLF4, LRMDA, PLBD1, CFD, CSTA, AOAH, RBP7, FPR1
CTSS, ASGR1, AC007952.4, CDA, JAML, TSPO, HK3, CPVL, NFE2, FGR
Negative: SOD2, CXCL8, C15orf48, DOCK4, SLC7A11, KYNU, EREG, THBS1, AC025580.2, CXCL5
MMP9, SDC2, MMP14, CXCL3, GLIS3, CXCL1, IL1B, SERPINB2, FTH1, CXCL16
VMO1, CYP27A1, RAB13, EPB41L3, CTSL, ABCA1, NRP1, IRAK2, NINJ1, CYP1B1
PC_ 3
Positive: CD79A, MS4A1, BANK1, AFF3, IGHM, LINC00926, NIBAN3, CD79B, FCRL1, RALGPS2
TCL1A, CD37, CD19, PAX5, HVCN1, FCRLA, CD74, BLNK, GNG7, SPIB
IGHD, ADAM28, RUBCNL, LINC02397, CD22, COBLL1, VPREB3, SWAP70, MEF2C, POU2AF1
Negative: CD3E, GIMAP7, TCF7, FYB1, GIMAP5, GIMAP4, LEF1, IL7R, GIMAP1, LINC00861
SARAF, LCK, IFITM1, TPT1, CD247, ITK, CAMK4, TC2N, CD3D, BCL11B
ITM2B, RPS15A, PRKCH, PCED1B-AS1, AAK1, PTPRC, CD3G, IL32, CD2, PRKCQ-AS1
PC_ 4
Positive: PFN1, TUBA1B, RAN, PRELID1, HSPE1, H2AFZ, CHCHD2, ACTB, ATP5MC3, NME1
RANBP1, NPM1, PPIA, ATP5F1B, HSPD1, SRM, GAPDH, HSP90AA1, UBE2S, SNRPD1
NDUFAB1, PRDX1, CYC1, EIF4A1, TPI1, CYCS, FCER1G, PPP1R14B, ALYREF, NME2
Negative: FOXP1, ARHGAP15, MBNL1, ANKRD44, BACH2, RIPOR2, ZBTB20, TNRC6B, RABGAP1L, MAML2
MALAT1, PRKCA, ARID1B, BCL2, PDE7A, RUNX1, LRBA, NCOA3, PACS1, ZCCHC7
DOCK10, CAMK4, AFF3, ELMO1, FTX, VPS13B, INPP4B, PDE3B, MGAT5, CAMK2D
PC_ 5
Positive: LGALS1, S100A11, S100A4, B2M, S100A6, TMSB4X, IL32, LSP1, SH3BGRL3, TMSB10
CRIP1, LAPTM5, CYBA, IFITM2, MYL6, EMP3, TAGLN2, VIM, CD52, IL2RG
APOBEC3G, FXYD5, RHOC, TNFRSF18, S1PR4, ACTB, IFITM1, S100A10, ITGB7, LGALS3
Negative: NPM1, HSPD1, SRM, HSP90AB1, HSPE1, NME1, HMGA1, HSPA9, RANBP1, HNRNPAB
PRKDC, NME2, NCL, HSP90AA1, SERBP1, PRELID1, RAN, CYC1, VDAC1, CCT8
UBE2S, TOMM40, RBM17, PPP1R14B, MTDH, PHB, MRPL12, H2AFZ, CCT5, CCT6A
# determine dimensionality of the data
ElbowPlot(All_samples_Merged, ndims = 50)
library(ggplot2)
library(RColorBrewer)
# Assuming you have 10 different cell lines, generating a color palette with 10 colors
cell_line_colors <- brewer.pal(10, "Set3")
# Assuming All_samples_Merged$cell_line is a factor or character vector containing cell line names
data <- as.data.frame(table(All_samples_Merged$cell_line))
colnames(data) <- c("cell_line", "nUMI") # Change column name to nUMI
ncells <- ggplot(data, aes(x = cell_line, y = nUMI, fill = cell_line)) +
geom_col() +
theme_classic() +
geom_text(aes(label = nUMI),
position = position_dodge(width = 0.9),
vjust = -0.25) +
scale_fill_manual(values = cell_line_colors) +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)) + # Adjust the title position
ggtitle("Filtered cells per sample") +
xlab("Cell lines") + # Adjust x-axis label
ylab("Frequency") # Adjust y-axis label
print(ncells)
# TEST-1
# given that the output of RunPCA is "pca"
# replace "so" by the name of your seurat object
pct <- All_samples_Merged[["pca"]]@stdev / sum(All_samples_Merged[["pca"]]@stdev) * 100
cumu <- cumsum(pct) # Calculate cumulative percents for each PC
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which((pct[-length(pct)] - pct[-1]) > 0.1), decreasing = T)[1] + 1
# last point where change of % of variation is more than 0.1%. -> co2
co2
[1] 12
# TEST-2
# get significant PCs
stdv <- All_samples_Merged[["pca"]]@stdev
sum.stdv <- sum(All_samples_Merged[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
cumulative <- cumsum(percent.stdv)
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co2 <- sort(which((percent.stdv[1:length(percent.stdv) - 1] -
percent.stdv[2:length(percent.stdv)]) > 0.1),
decreasing = T)[1] + 1
min.pc <- min(co1, co2)
min.pc
[1] 12
# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv,
cumu = cumulative,
rank = 1:length(percent.stdv))
# Elbow plot to visualize
ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) +
geom_text() +
geom_vline(xintercept = 90, color = "grey") +
geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
theme_bw()
All_samples_Merged <- FindNeighbors(All_samples_Merged,
dims = 1:22,
verbose = FALSE)
# understanding resolution
All_samples_Merged <- FindClusters(All_samples_Merged,
resolution = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7,0.8, 0.9, 1,1.2,1.5,2))
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 59355
Number of edges: 1864624
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9655
Number of communities: 15
Elapsed time: 34 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 59355
Number of edges: 1864624
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9483
Number of communities: 20
Elapsed time: 25 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 59355
Number of edges: 1864624
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9363
Number of communities: 21
Elapsed time: 24 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 59355
Number of edges: 1864624
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9278
Number of communities: 26
Elapsed time: 21 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 59355
Number of edges: 1864624
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9219
Number of communities: 30
Elapsed time: 31 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 59355
Number of edges: 1864624
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9159
Number of communities: 32
Elapsed time: 24 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 59355
Number of edges: 1864624
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9115
Number of communities: 33
Elapsed time: 24 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 59355
Number of edges: 1864624
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9065
Number of communities: 35
Elapsed time: 27 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 59355
Number of edges: 1864624
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9021
Number of communities: 36
Elapsed time: 27 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 59355
Number of edges: 1864624
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8977
Number of communities: 37
Elapsed time: 25 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 59355
Number of edges: 1864624
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8899
Number of communities: 38
Elapsed time: 22 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 59355
Number of edges: 1864624
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8775
Number of communities: 40
Elapsed time: 21 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 59355
Number of edges: 1864624
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8592
Number of communities: 48
Elapsed time: 16 seconds
# non-linear dimensionality reduction --------------
All_samples_Merged <- RunUMAP(All_samples_Merged,
dims = 1:22,
verbose = FALSE)
Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session
# note that you can set `label = TRUE` or use the Label Clusters function to help label
# individual clusters
DimPlot(All_samples_Merged,group.by = "cell_line",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
DimPlot(All_samples_Merged,group.by = "predicted.celltype.l2",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
DimPlot(All_samples_Merged,
group.by = "SCT_snn_res.0.1",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
DimPlot(All_samples_Merged,
group.by = "SCT_snn_res.0.2",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
DimPlot(All_samples_Merged,
group.by = "SCT_snn_res.0.3",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
DimPlot(All_samples_Merged,
group.by = "SCT_snn_res.0.4",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
DimPlot(All_samples_Merged,
group.by = "SCT_snn_res.0.5",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
DimPlot(All_samples_Merged,
group.by = "SCT_snn_res.0.6",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
DimPlot(All_samples_Merged,
group.by = "SCT_snn_res.0.7",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
DimPlot(All_samples_Merged,
group.by = "SCT_snn_res.0.8",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
DimPlot(All_samples_Merged,
group.by = "SCT_snn_res.0.9",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
DimPlot(All_samples_Merged,
group.by = "SCT_snn_res.1",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
DimPlot(All_samples_Merged,
group.by = "SCT_snn_res.1.2",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
DimPlot(All_samples_Merged,
group.by = "SCT_snn_res.1.5",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
DimPlot(All_samples_Merged,
group.by = "SCT_snn_res.2",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
# Set identity classes to an existing column in meta data
Idents(object = All_samples_Merged) <- "SCT_snn_res.0.7"
cluster_table <- table(Idents(All_samples_Merged))
barplot(cluster_table, main = "Number of Cells in Each Cluster",
xlab = "Cluster",
ylab = "Number of Cells",
col = rainbow(length(cluster_table)))
print(cluster_table)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
6886 6521 3497 3432 3372 2974 2881 2876 2816 2760 2484 1902 1892 1801 1780 1680
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
1596 1319 1153 972 925 785 782 745 446 293 202 169 126 108 90 61
32
29
table(All_samples_Merged$predicted.celltype.l2, All_samples_Merged$SCT_snn_res.0.2)
0 1 2 3 4 5 6 7 8 9
ASDC 0 0 0 0 0 0 0 0 0 0
B intermediate 0 2 0 0 2 0 0 0 0 0
B memory 30 0 4 2 19 1 8 184 3 1
B naive 0 1 0 0 0 2 0 0 0 0
CD14 Mono 2 30 1 5 0 2995 2 11 0 0
CD16 Mono 0 0 0 0 0 124 0 0 0 0
CD4 CTL 0 17 0 0 0 0 0 0 0 0
CD4 Naive 0 523 0 1476 0 1 0 0 0 0
CD4 Proliferating 17799 1 1672 14 2786 0 2243 1695 1437 1227
CD4 TCM 1517 4487 3191 1857 95 12 502 787 43 35
CD4 TEM 1 68 0 25 0 0 0 0 0 0
CD8 Naive 0 357 1 997 0 0 0 0 0 0
CD8 Proliferating 0 0 0 0 0 0 0 2 0 0
CD8 TCM 0 282 18 173 1 0 0 0 0 0
CD8 TEM 0 180 5 184 0 1 1 6 0 0
cDC1 0 0 0 0 3 13 0 4 0 0
cDC2 5 0 0 0 4 124 3 39 0 0
dnT 3 35 0 19 0 1 0 4 0 1
gdT 0 26 0 67 0 0 0 0 0 0
HSPC 729 23 9 0 639 0 54 8 353 0
ILC 0 1 0 3 0 0 0 0 0 0
MAIT 0 14 0 224 0 2 0 0 0 0
NK 0 93 0 14 0 4 0 0 0 0
NK Proliferating 2617 1 409 3 16 0 13 36 1 39
NK_CD56bright 0 1 0 3 0 0 0 0 0 0
pDC 0 0 0 0 0 0 0 0 0 0
Plasmablast 0 0 0 0 0 0 0 0 0 0
Platelet 0 2 0 0 0 0 0 0 0 0
Treg 1 196 1 104 0 0 3 4 0 0
10 11 12 13 14 15 16 17 18 19
ASDC 0 0 0 0 0 0 0 0 3 0
B intermediate 439 174 17 2 2 0 56 2 0 0
B memory 157 71 4 0 2 0 34 3 0 0
B naive 467 677 0 4 0 0 41 0 0 0
CD14 Mono 14 0 740 0 0 2 13 6 2 0
CD16 Mono 0 0 2 0 0 0 0 0 0 0
CD4 CTL 0 0 0 0 0 0 0 0 0 0
CD4 Naive 0 0 0 0 0 36 7 0 0 0
CD4 Proliferating 0 0 0 0 135 2 0 0 0 0
CD4 TCM 32 3 32 0 72 174 37 2 0 0
CD4 TEM 0 0 0 0 0 0 0 0 0 0
CD8 Naive 1 0 0 0 0 17 0 0 0 0
CD8 Proliferating 0 0 0 0 0 0 0 0 0 0
CD8 TCM 1 0 0 0 0 2 0 0 0 0
CD8 TEM 1 0 0 9 2 2 0 0 0 0
cDC1 0 0 0 0 1 0 0 21 0 0
cDC2 0 0 0 0 0 0 1 53 0 0
dnT 2 0 2 0 5 10 0 0 0 0
gdT 0 0 0 0 0 0 0 0 0 0
HSPC 5 0 0 0 4 0 9 1 0 0
ILC 1 0 0 0 0 1 1 0 0 0
MAIT 0 0 0 0 0 2 0 0 0 0
NK 1 0 0 419 0 2 1 0 0 0
NK Proliferating 0 0 0 2 30 0 0 0 0 0
NK_CD56bright 0 0 0 10 0 2 0 0 0 0
pDC 0 0 0 0 0 0 0 0 56 0
Plasmablast 19 0 0 0 0 0 0 0 0 0
Platelet 0 0 1 0 0 0 0 0 0 29
Treg 2 0 0 0 31 8 2 1 0 0
clustree(All_samples_Merged, prefix = "SCT_snn_res.")
# InstallData("pbmcref")
#
# # The RunAzimuth function can take a Seurat object as input
# All_samples_Merged <- RunAzimuth(All_samples_Merged, reference = "pbmcref")
DimPlot(All_samples_Merged, group.by = "predicted.celltype.l1",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
DimPlot(All_samples_Merged, group.by = "predicted.celltype.l1",
reduction = "umap",
label.size = 3,
repel = T,
label = F)
DimPlot(All_samples_Merged, group.by = "predicted.celltype.l2",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
DimPlot(All_samples_Merged, group.by = "predicted.celltype.l2",
reduction = "umap",
label.size = 3,
repel = T,
label = F)
DimPlot(All_samples_Merged, group.by = "predicted.celltype.l2",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
table(All_samples_Merged$predicted.celltype.l2, All_samples_Merged$SCT_snn_res.0.2)
0 1 2 3 4 5 6 7 8 9
ASDC 0 0 0 0 0 0 0 0 0 0
B intermediate 0 2 0 0 2 0 0 0 0 0
B memory 30 0 4 2 19 1 8 184 3 1
B naive 0 1 0 0 0 2 0 0 0 0
CD14 Mono 2 30 1 5 0 2995 2 11 0 0
CD16 Mono 0 0 0 0 0 124 0 0 0 0
CD4 CTL 0 17 0 0 0 0 0 0 0 0
CD4 Naive 0 523 0 1476 0 1 0 0 0 0
CD4 Proliferating 17799 1 1672 14 2786 0 2243 1695 1437 1227
CD4 TCM 1517 4487 3191 1857 95 12 502 787 43 35
CD4 TEM 1 68 0 25 0 0 0 0 0 0
CD8 Naive 0 357 1 997 0 0 0 0 0 0
CD8 Proliferating 0 0 0 0 0 0 0 2 0 0
CD8 TCM 0 282 18 173 1 0 0 0 0 0
CD8 TEM 0 180 5 184 0 1 1 6 0 0
cDC1 0 0 0 0 3 13 0 4 0 0
cDC2 5 0 0 0 4 124 3 39 0 0
dnT 3 35 0 19 0 1 0 4 0 1
gdT 0 26 0 67 0 0 0 0 0 0
HSPC 729 23 9 0 639 0 54 8 353 0
ILC 0 1 0 3 0 0 0 0 0 0
MAIT 0 14 0 224 0 2 0 0 0 0
NK 0 93 0 14 0 4 0 0 0 0
NK Proliferating 2617 1 409 3 16 0 13 36 1 39
NK_CD56bright 0 1 0 3 0 0 0 0 0 0
pDC 0 0 0 0 0 0 0 0 0 0
Plasmablast 0 0 0 0 0 0 0 0 0 0
Platelet 0 2 0 0 0 0 0 0 0 0
Treg 1 196 1 104 0 0 3 4 0 0
10 11 12 13 14 15 16 17 18 19
ASDC 0 0 0 0 0 0 0 0 3 0
B intermediate 439 174 17 2 2 0 56 2 0 0
B memory 157 71 4 0 2 0 34 3 0 0
B naive 467 677 0 4 0 0 41 0 0 0
CD14 Mono 14 0 740 0 0 2 13 6 2 0
CD16 Mono 0 0 2 0 0 0 0 0 0 0
CD4 CTL 0 0 0 0 0 0 0 0 0 0
CD4 Naive 0 0 0 0 0 36 7 0 0 0
CD4 Proliferating 0 0 0 0 135 2 0 0 0 0
CD4 TCM 32 3 32 0 72 174 37 2 0 0
CD4 TEM 0 0 0 0 0 0 0 0 0 0
CD8 Naive 1 0 0 0 0 17 0 0 0 0
CD8 Proliferating 0 0 0 0 0 0 0 0 0 0
CD8 TCM 1 0 0 0 0 2 0 0 0 0
CD8 TEM 1 0 0 9 2 2 0 0 0 0
cDC1 0 0 0 0 1 0 0 21 0 0
cDC2 0 0 0 0 0 0 1 53 0 0
dnT 2 0 2 0 5 10 0 0 0 0
gdT 0 0 0 0 0 0 0 0 0 0
HSPC 5 0 0 0 4 0 9 1 0 0
ILC 1 0 0 0 0 1 1 0 0 0
MAIT 0 0 0 0 0 2 0 0 0 0
NK 1 0 0 419 0 2 1 0 0 0
NK Proliferating 0 0 0 2 30 0 0 0 0 0
NK_CD56bright 0 0 0 10 0 2 0 0 0 0
pDC 0 0 0 0 0 0 0 0 56 0
Plasmablast 19 0 0 0 0 0 0 0 0 0
Platelet 0 0 1 0 0 0 0 0 0 29
Treg 2 0 0 0 31 8 2 1 0 0
# Load required libraries
library(Seurat)
library(harmony)
Loading required package: Rcpp
library(ggplot2)
# Run Harmony, adjusting for batch effect using "cell_line" or another grouping variable
All_samples_Merged <- RunHarmony(
object = All_samples_Merged,
group.by.vars = "cell_line", # Replace with the metadata column specifying batch or cell line
dims.use = 1:22 # Use the same dimensions as PCA
)
Transposing data matrix
Initializing state using k-means centroids initialization
Harmony 1/10
Harmony 2/10
Harmony 3/10
Harmony 4/10
Harmony 5/10
Harmony 6/10
Harmony 7/10
Harmony converged after 7 iterations
# Check results in harmony embeddings
harmony_embeddings <- Embeddings(All_samples_Merged, reduction = "harmony")
head(harmony_embeddings)
harmony_1 harmony_2 harmony_3 harmony_4 harmony_5
L1_AAACCTGAGGGCTTCC-1 -10.899604 0.98268070 -1.211556635 -9.570378 3.4448843
L1_AAACCTGGTGCAGGTA-1 7.205780 0.90197182 -2.701097793 -4.076884 6.0727076
L1_AAACCTGGTTAAAGTG-1 7.787954 1.56457296 -0.004752053 1.917375 -1.3764284
L1_AAACCTGTCAGGTAAA-1 -3.292479 0.07839532 2.450843433 5.607831 -2.0588132
L1_AAACCTGTCCCTGACT-1 -5.718660 0.32732459 -3.376279295 -7.244310 0.9586939
L1_AAACCTGTCCTTCAAT-1 8.876686 1.51160819 -2.546328753 -3.149502 2.2219193
harmony_6 harmony_7 harmony_8 harmony_9 harmony_10
L1_AAACCTGAGGGCTTCC-1 1.52754745 0.7577520 2.6205720 -1.1512761 0.11457416
L1_AAACCTGGTGCAGGTA-1 -1.43966294 -1.4481505 -1.4982254 -1.2156148 0.55289126
L1_AAACCTGGTTAAAGTG-1 -2.83995298 -3.0053897 -1.0033163 2.9123403 -6.00543612
L1_AAACCTGTCAGGTAAA-1 -1.79803203 -0.9116608 -0.8367686 0.5780582 -0.85708173
L1_AAACCTGTCCCTGACT-1 -0.05637497 0.9778898 0.9417120 -0.1671093 0.02191816
L1_AAACCTGTCCTTCAAT-1 -1.52566717 -2.7870499 0.3267525 1.6102970 -2.51891261
harmony_11 harmony_12 harmony_13 harmony_14 harmony_15
L1_AAACCTGAGGGCTTCC-1 -0.07379689 -0.6246939 -0.4855363 -1.3979994 -0.17680633
L1_AAACCTGGTGCAGGTA-1 -0.15339604 1.3163446 2.9693185 -0.8084943 -1.51019421
L1_AAACCTGGTTAAAGTG-1 1.43869912 -0.6003076 0.3557978 0.2903468 -0.29979313
L1_AAACCTGTCAGGTAAA-1 0.21677472 0.4279415 -0.1417651 -0.1680727 -0.09229846
L1_AAACCTGTCCCTGACT-1 0.05237633 -1.2698659 -2.6869148 -0.1470600 0.44823829
L1_AAACCTGTCCTTCAAT-1 0.62829403 0.4176351 2.5454951 0.2286750 -2.24089516
harmony_16 harmony_17 harmony_18 harmony_19 harmony_20
L1_AAACCTGAGGGCTTCC-1 -0.7209589 0.2629892 0.5084070 -2.3410178 -1.08232318
L1_AAACCTGGTGCAGGTA-1 -1.1233209 -0.7759366 -0.4556121 0.4036273 -0.33070207
L1_AAACCTGGTTAAAGTG-1 0.4732435 -2.4316126 0.1420740 -0.5584532 -0.12870196
L1_AAACCTGTCAGGTAAA-1 -0.7654886 0.4943027 0.2855212 0.9637187 -0.02475325
L1_AAACCTGTCCCTGACT-1 -1.2658428 1.9271417 2.9475133 -1.0829796 -4.37239826
L1_AAACCTGTCCTTCAAT-1 -1.0074143 -0.3874226 0.4310478 0.8380242 -0.14885963
harmony_21 harmony_22
L1_AAACCTGAGGGCTTCC-1 -1.6315177 -1.2976811
L1_AAACCTGGTGCAGGTA-1 -0.5426686 -0.7734888
L1_AAACCTGGTTAAAGTG-1 -0.9357523 0.6235443
L1_AAACCTGTCAGGTAAA-1 0.8145846 -1.2096757
L1_AAACCTGTCCCTGACT-1 -0.2391519 0.4231619
L1_AAACCTGTCCTTCAAT-1 0.4693971 -0.6337955
# Run UMAP on Harmony embeddings
All_samples_Merged <- RunUMAP(All_samples_Merged, reduction = "harmony", dims = 1:22)
00:10:18 UMAP embedding parameters a = 0.9922 b = 1.112
00:10:18 Read 59355 rows and found 22 numeric columns
00:10:18 Using Annoy for neighbor search, n_neighbors = 30
00:10:18 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
00:10:25 Writing NN index file to temp file /tmp/Rtmpw0VUoZ/file1e2834ce7944e
00:10:25 Searching Annoy index using 1 thread, search_k = 3000
00:10:48 Annoy recall = 100%
00:10:50 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
00:10:55 Initializing from normalized Laplacian + noise (using RSpectra)
00:10:59 Commencing optimization for 200 epochs, with 2586216 positive edges
00:12:15 Optimization finished
# Optionally, find neighbors and clusters (if you plan to do clustering analysis)
All_samples_Merged <- FindNeighbors(All_samples_Merged, reduction = "harmony", dims = 1:22)
Computing nearest neighbor graph
Computing SNN
All_samples_Merged <- FindClusters(All_samples_Merged, resolution = 0.5) # Adjust resolution as needed
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 59355
Number of edges: 1897755
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9188
Number of communities: 27
Elapsed time: 22 seconds
# Visualize UMAP
DimPlot(All_samples_Merged, reduction = "umap", group.by = "cell_line", label = TRUE, pt.size = 0.5) +
ggtitle("UMAP of Harmony-Integrated Data")
# Visualize UMAP with batch/cell line information
DimPlot(All_samples_Merged, reduction = "umap", group.by = "cell_line", label = TRUE, pt.size = 0.5) +
ggtitle("UMAP - Colored by Cell Line (After Harmony Integration)")
# Visualize UMAP with clusters
DimPlot(All_samples_Merged, reduction = "umap", group.by = "seurat_clusters", label = TRUE, pt.size = 0.5) +
ggtitle("UMAP - Clustered Data (After Harmony Integration)")
# Visualize specific cell types or other metadata
DimPlot(All_samples_Merged, reduction = "umap", group.by = "predicted.celltype.l2", label = TRUE, pt.size = 0.5) +
ggtitle("UMAP - Cell Types After Harmony Integration")
save(All_samples_Merged, file = "../../../0-IMP-OBJECTS/All_Samples_Merged_with_10x_Azitmuth_Annotated_SCT_HPC_SCT-regressed-cell_line.robj")