Loading required package: SeuratObject
Loading required package: sp
Attaching package: 'SeuratObject'
The following objects are masked from 'package:base':
intersect, t
── Installed datasets ──────────────────────────────── SeuratData v0.2.2.9001 ──
✔ pbmcref 1.0.0 ✔ pbmcsca 3.0.0
────────────────────────────────────── Key ─────────────────────────────────────
✔ Dataset loaded successfully
❯ Dataset built with a newer version of Seurat than installed
❓ Unknown version of Seurat installed
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats 1.0.0 ✔ readr 2.1.5
✔ ggplot2 3.5.1 ✔ stringr 1.5.1
✔ lubridate 1.9.3 ✔ tibble 3.2.1
✔ purrr 1.0.2 ✔ tidyr 1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Attaching package: 'magrittr'
The following object is masked from 'package:purrr':
set_names
The following object is masked from 'package:tidyr':
extract
Attaching package: 'dbplyr'
The following objects are masked from 'package:dplyr':
ident, sql
Registered S3 method overwritten by 'SeuratDisk':
method from
as.sparse.H5Group Seurat
Attaching shinyBS
Loading required package: ggraph
Attaching package: 'ggraph'
The following object is masked from 'package:sp':
geometry
# Load the Seurat object merged from the cell lines and the PBMC controls
# (already filtered). `load()` returns the names of the objects it restored,
# so check that the file really provided `filtered_seurat`; otherwise a stale
# object of that name left in the workspace would be picked up silently.
loaded_objects <- load("0-imp_Robj/SS_CD4_Tcells_Azimuth_Annotated_PBMC10x_excluding_nonCD4_cells_from_Control_Bcells_from_L4_and_ILC_NK_just_oneCell.robj")
if (!("filtered_seurat" %in% loaded_objects)) {
  stop("The .robj file does not contain an object named 'filtered_seurat'.",
       call. = FALSE)
}
All_samples_Merged <- filtered_seurat
# Load necessary libraries
library(Seurat)
# Preview the first rows of the per-cell metadata table
head(All_samples_Merged@meta.data)
# Confirm that every metadata column listed below exists in the object
required_columns <- c("orig.ident", "nCount_RNA", "nFeature_RNA", "nUMI", "ngene")
missing_columns <- required_columns[
  !(required_columns %in% colnames(All_samples_Merged@meta.data))
]
if (length(missing_columns) == 0) {
  cat("All required columns are present.\n")
} else {
  cat("Missing columns:", paste(missing_columns, collapse = ", "), "\n")
}
All required columns are present.
# Report how many cells (columns) the merged object holds
cat("Number of cells:", dim(All_samples_Merged)[2], "\n")
Number of cells: 49386
# Report how many features (rows) the object exposes
cat("Number of features:", dim(All_samples_Merged)[1], "\n")
Number of features: 26179
# Header for the per-`orig.ident` cell counts, so each sample's size can be
# checked by eye
cat("Cell counts per group:\n", sep = "")
Cell counts per group:
# Number of cells carrying each `orig.ident` label
print(table(All_samples_Merged@meta.data$orig.ident))
L1 L2 L3 L4 L5 L6 L7 PBMC PBMC10x
5825 5935 6428 6021 6022 5148 5331 5171 3505
# Cell barcodes must be unique; duplicates would point to a problem in the merge
if (anyDuplicated(colnames(All_samples_Merged)) == 0) {
  cat("Cell IDs are unique.\n")
} else {
  cat("Warning: There are duplicated cell IDs.\n")
}
Cell IDs are unique.
# Make RNA the active assay before inspecting its layers
DefaultAssay(All_samples_Merged) <- "RNA"
# Dimensions of the RNA counts layer, fetched with the layer-based accessor
cat(
  "Dimensions of the RNA counts layer:",
  dim(GetAssayData(All_samples_Merged, layer = "counts")),
  "\n"
)
Dimensions of the RNA counts layer: 36601 49386
# Dimensions of the RNA data layer
cat(
  "Dimensions of the RNA data layer:",
  dim(GetAssayData(All_samples_Merged, layer = "data")),
  "\n"
)
Dimensions of the RNA data layer: 36601 49386
# The ADT assay is optional: report whether it exists and, if so, its size
if (!("ADT" %in% names(All_samples_Merged@assays))) {
  cat("ADT assay is not present.\n")
} else {
  cat("ADT assay is present.\n")
  cat(
    "Dimensions of the ADT counts layer:",
    dim(GetAssayData(All_samples_Merged, assay = "ADT", layer = "counts")),
    "\n"
  )
}
ADT assay is present.
Dimensions of the ADT counts layer: 56 49386
# InstallData("pbmcref")
#
# # The RunAzimuth function can take a Seurat object as input
# All_samples_Merged <- RunAzimuth(All_samples_Merged, reference = "pbmcref")
# Remove the legacy `percent.mito` column, but only when the object actually
# has it: assigning NULL to a missing metadata column raises
# "Cannot find cell-level meta data named percent.mito".
if ("percent.mito" %in% colnames(All_samples_Merged@meta.data)) {
  All_samples_Merged$percent.mito <- NULL
}
Warning: Cannot find cell-level meta data named percent.mito
# Use the `cell_line` metadata column as the active identity
Idents(All_samples_Merged) <- "cell_line"

# Percentage of counts from features matching "^RP[SL]", stored as `percent.rb`
All_samples_Merged[["percent.rb"]] <- PercentageFeatureSet(
  All_samples_Merged,
  pattern = "^RP[SL]"
)
# Coerce `percent.rb` to numeric and replace missing values (NA/NaN) with 0
All_samples_Merged$percent.rb <- replace(
  as.numeric(All_samples_Merged$percent.rb),
  is.na(All_samples_Merged$percent.rb),
  0
)

# Percentage of counts from features matching "^MT-", stored as `percent.mt`
# (the [[ operator adds a column to the object metadata)
All_samples_Merged[["percent.mt"]] <- PercentageFeatureSet(
  All_samples_Merged,
  pattern = "^MT-"
)
# Coerce `percent.mt` to numeric and replace missing values (NA/NaN) with 0
All_samples_Merged$percent.mt <- replace(
  as.numeric(All_samples_Merged$percent.mt),
  is.na(All_samples_Merged$percent.mt),
  0
)
# Violin plots of the four QC metrics, with smaller panel titles
VlnPlot(
  All_samples_Merged,
  features = c("nFeature_RNA", "nCount_RNA", "percent.mt", "percent.rb"),
  ncol = 4,
  pt.size = 0.1
) &
  theme(plot.title = element_text(size = 10))

# percent.mt against percent.rb
FeatureScatter(
  All_samples_Merged,
  feature1 = "percent.mt",
  feature2 = "percent.rb"
)

# Violin plots of the three core QC metrics with default point size
VlnPlot(
  All_samples_Merged,
  features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
  ncol = 3
)

# percent.mt against percent.rb again, now with a linear fit overlaid
FeatureScatter(
  All_samples_Merged,
  feature1 = "percent.mt",
  feature2 = "percent.rb"
) +
  geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'
# nCount_RNA against nFeature_RNA per cell, with a linear fit overlaid
FeatureScatter(
  All_samples_Merged,
  feature1 = "nCount_RNA",
  feature2 = "nFeature_RNA"
) +
  geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'
# FeatureScatter is typically used to visualize feature-feature relationships
# for anything calculated by the object, i.e. columns in object metadata,
# PC scores etc. Here: nCount_RNA against percent.mt with a linear fit.
FeatureScatter(
  All_samples_Merged,
  feature1 = "nCount_RNA",
  feature2 = "percent.mt"
) +
  geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'
# nCount_RNA against nFeature_RNA with a linear fit (same call as the
# nCount_RNA / nFeature_RNA scatter plotted earlier in this section)
FeatureScatter(
  All_samples_Merged,
  feature1 = "nCount_RNA",
  feature2 = "nFeature_RNA"
) +
  geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'
## Assign cell-cycle scores
Running SCTransform on assay: RNA
Running SCTransform on layer: counts
vst.flavor='v2' set. Using model with fixed slope and excluding poisson genes.
Variance stabilizing transformation of count matrix of size 26179 by 49386
Model formula is y ~ log_umi
Get Negative Binomial regression parameters per gene
Using 2000 genes, 5000 cells
Found 478 outliers - those will be ignored in fitting/regularization step
Second step: Get residuals using fitted parameters for 26179 genes
Computing corrected count matrix for 26179 genes
Calculating gene attributes
Wall clock passed: Time difference of 7.871309 mins
Determine variable features
Getting residuals for block 1(of 10) for counts dataset
Getting residuals for block 2(of 10) for counts dataset
Getting residuals for block 3(of 10) for counts dataset
Getting residuals for block 4(of 10) for counts dataset
Getting residuals for block 5(of 10) for counts dataset
Getting residuals for block 6(of 10) for counts dataset
Getting residuals for block 7(of 10) for counts dataset
Getting residuals for block 8(of 10) for counts dataset
Getting residuals for block 9(of 10) for counts dataset
Getting residuals for block 10(of 10) for counts dataset
Finished calculating residuals for counts
Set default assay to SCT
Warning: The following features are not present in the object: MLF1IP, not
searching for symbol synonyms
Warning: The following features are not present in the object: FAM64A, HN1, not
searching for symbol synonyms
# Run SCTransform, regressing out the ribosomal and mitochondrial percentages,
# the cell-cycle difference score and the cell line, with scaled and centered
# residuals
All_samples_Merged <- SCTransform(
  All_samples_Merged,
  vars.to.regress = c(
    "percent.rb",
    "percent.mt",
    "CC.Difference",
    "cell_line"
  ),
  do.scale = TRUE,
  do.center = TRUE,
  verbose = TRUE
)
Running SCTransform on assay: RNA
Running SCTransform on layer: counts
vst.flavor='v2' set. Using model with fixed slope and excluding poisson genes.
Variance stabilizing transformation of count matrix of size 26179 by 49386
Model formula is y ~ log_umi
Get Negative Binomial regression parameters per gene
Using 2000 genes, 5000 cells
Found 478 outliers - those will be ignored in fitting/regularization step
Second step: Get residuals using fitted parameters for 26179 genes
Computing corrected count matrix for 26179 genes
Calculating gene attributes
Wall clock passed: Time difference of 6.329463 mins
Determine variable features
Regressing out percent.rb, percent.mt, CC.Difference, cell_line
Centering and scaling data matrix
Getting residuals for block 1(of 10) for counts dataset
Getting residuals for block 2(of 10) for counts dataset
Getting residuals for block 3(of 10) for counts dataset
Getting residuals for block 4(of 10) for counts dataset
Getting residuals for block 5(of 10) for counts dataset
Getting residuals for block 6(of 10) for counts dataset
Getting residuals for block 7(of 10) for counts dataset
Getting residuals for block 8(of 10) for counts dataset
Getting residuals for block 9(of 10) for counts dataset
Getting residuals for block 10(of 10) for counts dataset
Regressing out percent.rb, percent.mt, CC.Difference, cell_line
Centering and scaling data matrix
Finished calculating residuals for counts
Set default assay to SCT
# Variable features of the SCT assay, read through the accessor rather than by
# reaching into the assay's slots
Variables_genes <- VariableFeatures(All_samples_Merged, assay = "SCT")
# Exclude HLA genes ("HLA-"), XIST, and TCR variable-segment genes (TRBV/TRAV)
# from the features used for PCA
Variables_genes_after_exclusion <- Variables_genes[
  !grepl("^HLA-|^XIST|^TRBV|^TRAV", Variables_genes)
]
# Set the seed for clustering steps
set.seed(123)
# PCA on the filtered variable genes. `do.print`, `pcs.print` and `genes.print`
# are Seurat v2 argument names that current RunPCA silently ignores (the run
# printed the default 30 genes per direction instead of the requested 15); the
# current equivalents are `verbose`, `ndims.print` and `nfeatures.print`.
All_samples_Merged <- RunPCA(
  All_samples_Merged,
  features = Variables_genes_after_exclusion,
  verbose = TRUE,
  ndims.print = 1:5,
  nfeatures.print = 15,
  npcs = 50
)
PC_ 1
Positive: S100A6, S100A11, S100A4, LGALS1, B2M, IL32, LSP1, SH3BGRL3, CRIP1, TMSB4X
LAPTM5, TMSB10, S100A10, VIM, FXYD5, EMP3, IFITM2, IL2RG, CD52, S1PR4
TAGLN2, TIMP1, IFITM1, TNFRSF18, APOBEC3G, LGALS3, CYBA, OPTN, MYL6, CDKN1A
Negative: NPM1, HSPD1, HSP90AB1, SRM, HSPE1, NME1, HMGA1, PRELID1, RAN, HSP90AA1
NCL, HNRNPAB, NME2, HSPA9, SERBP1, CYC1, PPP1R14B, TUBA1B, UBE2S, TOMM40
H2AFZ, VDAC1, CCT8, MRPL12, ODC1, ATP5F1B, SNRPD1, ATP5MC3, MTDH, RBM17
PC_ 2
Positive: ACTB, PFN1, TUBA1B, CLIC1, PPIA, TMSB4X, EIF4A1, CHCHD2, H2AFZ, RAN
HMGN2, B2M, TUBA4A, MYL6, IL32, TMSB10, SH3BGRL3, ACTG1, EIF5A, SLC9A3R1
TUBB4B, HMGB2, STMN1, COX6A1, S100A4, TPI1, PSMB6, IFITM2, PSMB8, COTL1
Negative: MBNL1, PRKCA, RABGAP1L, ARHGAP15, ELMO1, CAMK4, DENND4A, LRBA, RUNX1, NCALD
GRAMD1B, MAML2, RAD51B, FTX, BCL2, WWOX, ATXN1, FOXP1, VPS13B, PDE7A
INPP4B, ZBTB20, DOCK10, CDKAL1, SIK3, NCOA3, NEAT1, ARID1B, TSHZ2, PTPRJ
PC_ 3
Positive: RRM2, HIST1H4C, STMN1, TYMS, HIST1H1E, TUBB, PCLAF, TOP2A, NUSAP1, TK1
HMGB2, MKI67, HIST1H1D, PKMYT1, ATAD2, TUBA1B, H2AFX, H2AFZ, HIST1H1C, DUT
HIST1H1B, KIFC1, SMC4, NEIL3, LMNB1, HIST1H1A, DHFR, TUBA4A, MXD3, PCNA
Negative: TNFSF9, IQCG, SQSTM1, CCL1, SERPINE1, CCL3, ANKRD33B, TRAF1, CFLAR, IL4I1
AC114977.1, KLF6, CD40, AC104365.1, GZMB, PMAIP1, CCL4, SPAG9, HSP90AB1, JUNB
CSF2, TNFAIP3, DUSP4, HSPA8, CCR7, CD82, SRGN, RGS1, LYST, LMNA
PC_ 4
Positive: HIST1H4C, HIST1H1E, RRM2, MKI67, HERC5, OASL, HIST1H1B, TRAF1, PMAIP1, IFIT3
ATAD2, NUSAP1, TOP2A, TUBB, NFKB2, DIAPH3, IFIT2, DENND4A, WARS, HIST1H1D
UBE2Z, TP63, HIST1H1C, EZH2, TYMS, CCL5, IFIH1, NSD2, ARHGAP10, PCLAF
Negative: FOXP1, RIPOR2, LEF1, BCL11B, MAML2, SERINC5, PLCL1, PITPNC1, IGF1R, ZBTB20
ARHGAP15, BACH2, PRKCH, ATP10A, UBE2S, KLF12, TXK, FHIT, PTTG1, CCND3
PACS1, PRKCA, DANCR, MLLT3, HSPE1, TMEM131L, TC2N, NPM1, FAM117B, CDC20
PC_ 5
Positive: GSTP1, TPRG1, FOXP1, BATF3, CAVIN3, MAML2, CCL5, PDE4DIP, CLIC2, MGST3
RPL26, CSMD1, RIPOR2, LINC02406, AC096577.1, NPDC1, NFKB2, ARL14EPL, PHGDH, ACADVL
LEF1, SNHG29, ANK3, OASL, PLCL1, C1orf162, PFN1, AL122017.1, AL590550.1, CCDC50
Negative: RPL35A, LRP1B, AC097518.2, MACROD2, GPR160, AHNAK, RNF213, SLC7A11-AS1, SPOCK1, HGF
AC114930.1, GM2A, ITGA4, PRLR, PIM2, TENM3, RPL37, LINC02694, RYR1, NKAIN2
JPT1, AGMO, AC010967.1, ERC2, FTL, NCAM2, HNRNPAB, AC243829.2, JAG1, NETO2
# Elbow plot over all 50 computed PCs to help judge the dimensionality of the data
ElbowPlot(All_samples_Merged, ndims = 50, reduction = "pca")
library(ggplot2)
library(RColorBrewer)
# Bar chart of the number of filtered cells in each cell line.
# `table()` counts cells per `cell_line`; the count column is named `n_cells`
# because it holds cell counts, not UMI counts.
data <- as.data.frame(table(All_samples_Merged$cell_line))
colnames(data) <- c("cell_line", "n_cells")

# One fill colour per cell line actually present, instead of assuming ten.
# "Set3" provides 3 to 12 colours; beyond that the palette is interpolated.
n_cell_lines <- nrow(data)
cell_line_colors <- if (n_cell_lines <= 12) {
  brewer.pal(max(n_cell_lines, 3), "Set3")[seq_len(n_cell_lines)]
} else {
  colorRampPalette(brewer.pal(12, "Set3"))(n_cell_lines)
}

ncells <- ggplot(data, aes(x = cell_line, y = n_cells, fill = cell_line)) +
  geom_col() +
  theme_classic() +
  # Print the count just above each bar
  geom_text(aes(label = n_cells),
            position = position_dodge(width = 0.9),
            vjust = -0.25) +
  scale_fill_manual(values = cell_line_colors) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5)) + # centre the title
  ggtitle("Filtered cells per sample") +
  xlab("Cell lines") +
  ylab("Frequency")
print(ncells)
# TEST-1
# Each PC's share (in percent) of the summed PCA standard deviations
pca_stdev <- Stdev(All_samples_Merged, reduction = "pca")
pct <- pca_stdev / sum(pca_stdev) * 100
cumu <- cumsum(pct) # Calculate cumulative percents for each PC
# Drop in percentage from each PC to the next one (pct[i] - pct[i + 1])
pct_drop <- -diff(pct)
# co2: the PC right after the last point where the drop is more than 0.1%
# (NA when no drop exceeds 0.1%)
co2 <- sort(which(pct_drop > 0.1), decreasing = TRUE)[1] + 1
co2
[1] 9
# TEST-2
# get significant PCs
stdv <- All_samples_Merged[["pca"]]@stdev
sum.stdv <- sum(stdv)
percent.stdv <- (stdv / sum.stdv) * 100
cumulative <- cumsum(percent.stdv)
# co1: first PC at which the cumulative share exceeds 90% while the PC itself
# contributes less than 5%
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
# co2: the PC right after the last drop of more than 0.1% between consecutive
# PCs. `-diff(x)` is x[i] - x[i + 1]; the previous `x[1:length(x) - 1]` parsed
# as `x[(1:length(x)) - 1]` and only worked because index 0 is dropped.
co2 <- sort(which(-diff(percent.stdv) > 0.1), decreasing = TRUE)[1] + 1
# Take the smaller cutoff, ignoring one that could not be determined (NA)
min.pc <- min(co1, co2, na.rm = TRUE)
min.pc
[1] 9
# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv,
                      cumu = cumulative,
                      rank = seq_along(percent.stdv))
# Elbow plot to visualize: each PC is drawn as its rank number and coloured by
# whether it lies beyond `min.pc`. The aesthetics refer to the columns of
# `plot_df` rather than to the global vectors it was built from; the axis
# titles keep the names the plot showed before.
pc_cutoff_plot <- ggplot(plot_df,
                         aes(x = cumu, y = pct, label = rank,
                             color = rank > min.pc)) +
  geom_text() +
  geom_vline(xintercept = 90, color = "grey") +
  labs(x = "cumulative", y = "percent.stdv") +
  theme_bw()
# Horizontal guide at the smallest share above 5%. When no PC exceeds 5%,
# `min()` of the empty selection would warn and return Inf, so the guide is
# only added when there is something to mark.
pct_above_5 <- plot_df$pct[plot_df$pct > 5]
if (length(pct_above_5) > 0) {
  pc_cutoff_plot <- pc_cutoff_plot +
    geom_hline(yintercept = min(pct_above_5), color = "grey")
}
print(pc_cutoff_plot)
Warning in min(percent.stdv[percent.stdv > 5]): no non-missing arguments to
min; returning Inf
# Fix the RNG state before the graph-based clustering steps
set.seed(123)
# Nearest-neighbour graph on the first 16 dimensions
All_samples_Merged <- FindNeighbors(
  All_samples_Merged,
  dims = 1:16,
  verbose = FALSE
)
# Cluster at a range of resolutions to see how the granularity changes
All_samples_Merged <- FindClusters(
  All_samples_Merged,
  resolution = c(
    0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
    1, 1.2, 1.5, 2
  )
)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 49386
Number of edges: 1511040
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9514
Number of communities: 10
Elapsed time: 25 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 49386
Number of edges: 1511040
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9342
Number of communities: 14
Elapsed time: 19 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 49386
Number of edges: 1511040
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9234
Number of communities: 16
Elapsed time: 16 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 49386
Number of edges: 1511040
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9153
Number of communities: 19
Elapsed time: 19 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 49386
Number of edges: 1511040
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9073
Number of communities: 22
Elapsed time: 20 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 49386
Number of edges: 1511040
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9016
Number of communities: 23
Elapsed time: 15 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 49386
Number of edges: 1511040
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8961
Number of communities: 23
Elapsed time: 16 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 49386
Number of edges: 1511040
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8901
Number of communities: 24
Elapsed time: 14 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 49386
Number of edges: 1511040
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8842
Number of communities: 23
Elapsed time: 15 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 49386
Number of edges: 1511040
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8784
Number of communities: 25
Elapsed time: 14 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 49386
Number of edges: 1511040
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8687
Number of communities: 30
Elapsed time: 16 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 49386
Number of edges: 1511040
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8550
Number of communities: 32
Elapsed time: 14 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 49386
Number of edges: 1511040
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8355
Number of communities: 37
Elapsed time: 13 seconds
# Fix the RNG state before the embedding step
set.seed(123)
# Non-linear dimensionality reduction (UMAP) on the first 16 dimensions
All_samples_Merged <- RunUMAP(
  All_samples_Merged,
  dims = 1:16,
  verbose = FALSE
)
Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session
# Labelled UMAPs, one per metadata grouping: the cell line, the level-2
# predicted cell type, and the clustering at every resolution.
# The plots previously came from fifteen copy-pasted DimPlot() calls that
# differed only in `group.by`; they are now produced by one helper and a loop
# (`print()` is needed because plots are not auto-printed inside a loop).

# Draw a UMAP of `All_samples_Merged` coloured and labelled by `group_col`,
# the name of a metadata column.
plot_umap_by <- function(group_col) {
  DimPlot(All_samples_Merged,
          group.by = group_col,
          reduction = "umap",
          label.size = 3,
          repel = TRUE,
          label = TRUE,
          label.box = TRUE)
}

# Resolutions are written as strings so the column names match exactly
umap_groupings <- c(
  "cell_line",
  "predicted.celltype.l2",
  paste0("SCT_snn_res.",
         c("0.1", "0.2", "0.3", "0.4", "0.5", "0.6", "0.7", "0.8", "0.9",
           "1", "1.2", "1.5", "2"))
)
for (grouping in umap_groupings) {
  print(plot_umap_by(grouping))
}
# Use the resolution-0.9 clustering as the active identity
Idents(All_samples_Merged) <- "SCT_snn_res.0.9"
# Cells per cluster: drawn as a bar chart, then printed as a table
cluster_table <- table(Idents(All_samples_Merged))
barplot(
  cluster_table,
  main = "Number of Cells in Each Cluster",
  xlab = "Cluster",
  ylab = "Number of Cells",
  col = rainbow(length(cluster_table))
)
print(cluster_table)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
5213 4891 4822 3518 2977 2719 2614 2448 2331 2248 2214 2154 2067 1931 1854 1481
16 17 18 19 20 21 22
1405 868 757 513 228 70 63
# Cross-tabulate the level-2 predicted cell type against the resolution-0.1 clusters
table(
  All_samples_Merged@meta.data$predicted.celltype.l2,
  All_samples_Merged@meta.data$SCT_snn_res.0.1
)
0 1 2 3 4 5 6 7 8 9
B intermediate 0 0 2 0 0 0 0 2 0 3
B memory 17 0 204 8 2 0 3 16 0 2
CD14 Mono 0 0 11 2 0 0 0 0 0 13
CD4 CTL 0 11 0 0 1 0 0 0 0 1
CD4 Naive 4 1975 15 0 1 0 0 1 38 8
CD4 Proliferating 14090 324 6331 3910 1422 905 1441 588 0 0
CD4 TCM 1420 6217 1087 529 2191 1018 51 77 147 93
CD4 TEM 3 80 0 0 1 0 0 0 0 0
CD8 Proliferating 0 0 2 0 0 0 0 0 0 0
CD8 TCM 15 0 0 0 2 0 0 0 0 0
CD8 TEM 8 0 5 1 1 0 0 0 0 0
cDC1 0 0 5 0 0 0 0 3 0 0
cDC2 1 0 43 3 0 0 1 3 0 2
dnT 4 0 9 0 1 0 0 0 1 0
HSPC 203 9 530 658 0 0 353 44 0 8
NK Proliferating 2715 115 266 7 30 14 1 13 0 0
Treg 20 1 15 3 0 1 2 1 0 2
# Clustering tree over every metadata column starting with "SCT_snn_res."
clustree(All_samples_Merged, prefix = "SCT_snn_res.")
# InstallData("pbmcref")
#
# # The RunAzimuth function can take a Seurat object as input
# All_samples_Merged <- RunAzimuth(All_samples_Merged, reference = "pbmcref")

# UMAPs coloured by the level-1 and level-2 predicted cell types, each drawn
# once with boxed labels and once without labels. The earlier copy-pasted
# version drew the labelled level-2 plot a second time; that exact duplicate
# is dropped. `print()` is needed because plots are not auto-printed in a loop.
for (annotation_col in c("predicted.celltype.l1", "predicted.celltype.l2")) {
  for (show_labels in c(TRUE, FALSE)) {
    print(
      DimPlot(All_samples_Merged,
              group.by = annotation_col,
              reduction = "umap",
              label.size = 3,
              repel = TRUE,
              label = show_labels,
              label.box = show_labels)
    )
  }
}
# Cross-tabulate the level-2 predicted cell type against the resolution-0.2 clusters
table(All_samples_Merged$predicted.celltype.l2, All_samples_Merged$SCT_snn_res.0.2)
0 1 2 3 4 5 6 7 8 9
B intermediate 0 0 0 0 2 0 0 0 0 0
B memory 14 0 19 2 185 6 4 1 0 3
CD14 Mono 0 0 0 0 11 0 2 0 0 0
CD4 CTL 0 11 0 1 0 0 0 0 0 0
CD4 Naive 4 1975 5 1 0 0 0 0 0 0
CD4 Proliferating 12574 328 4458 1432 1982 2310 2309 682 905 1442
CD4 TCM 1350 5813 292 2203 766 26 522 25 1014 51
CD4 TEM 1 79 0 1 0 0 0 2 0 0
CD8 Proliferating 0 0 0 0 2 0 0 0 0 0
CD8 TCM 15 0 0 2 0 0 0 0 0 0
CD8 TEM 6 0 3 1 4 0 1 0 0 0
cDC1 0 0 1 0 4 0 0 0 0 0
cDC2 0 0 2 0 41 1 2 0 0 1
dnT 3 0 6 1 4 0 0 0 0 0
HSPC 202 9 519 1 11 603 55 0 0 353
NK Proliferating 1325 118 219 32 49 8 12 1370 14 1
Treg 20 0 11 0 3 0 4 1 1 2
10 11 12 13
B intermediate 2 0 0 3
B memory 16 0 0 2
CD14 Mono 0 0 0 13
CD4 CTL 0 0 0 1
CD4 Naive 1 8 40 8
CD4 Proliferating 589 0 0 0
CD4 TCM 78 428 169 93
CD4 TEM 0 1 0 0
CD8 Proliferating 0 0 0 0
CD8 TCM 0 0 0 0
CD8 TEM 0 0 0 0
cDC1 3 0 0 0
cDC2 4 0 0 2
dnT 0 0 1 0
HSPC 44 0 0 8
NK Proliferating 13 0 0 0
Treg 1 0 0 2
#save(All_samples_Merged, file = "0-imp_Robj/All_Samples_Merged_with_10x_Azitmuth_Annotated_SCT_HPC_without_harmony_integration_removed_nonCD4cells_from_control_and_Bcells_from_L4_ILC_NK_oneCell.robj")
# Load required libraries
library(Seurat)
library(harmony)
Loading required package: Rcpp
library(ggplot2)
# Set the seed before Harmony as well, in line with the other stochastic steps
# in this script (its log reports a k-means initialization).
# NOTE(review): presumably this makes the Harmony run repeatable - confirm.
set.seed(123)
# Run Harmony, adjusting for batch effects between cell lines.
# `group.by.vars` names the metadata column that identifies the batch.
# (The trailing comma that left an empty argument in the call is removed.)
All_samples_Merged <- RunHarmony(
  object = All_samples_Merged,
  group.by.vars = "cell_line"
)
Transposing data matrix
Initializing state using k-means centroids initialization
Harmony 1/10
Harmony 2/10
Harmony 3/10
Harmony 4/10
Harmony 5/10
Harmony converged after 5 iterations
# Check results in harmony embeddings
harmony_embeddings <- Embeddings(All_samples_Merged, reduction = "harmony")
# Preview the first cells for the first five dimensions only; printing every
# dimension floods the report with dozens of lines of numbers.
preview_dims <- seq_len(min(5, ncol(harmony_embeddings)))
head(harmony_embeddings[, preview_dims, drop = FALSE])
harmony_1 harmony_2 harmony_3 harmony_4 harmony_5
L1_AAACCTGAGGGCTTCC-1 10.088223 -10.371394 -0.8305962 -0.5312549 1.7719974
L1_AAACCTGGTGCAGGTA-1 2.088653 -1.106084 -2.3231431 -2.4220913 -1.1199939
L1_AAACCTGGTTAAAGTG-1 -4.055330 3.007435 4.4424632 4.9942649 -1.6570142
L1_AAACCTGTCAGGTAAA-1 -2.612462 4.631311 0.6572374 1.1174345 0.7869959
L1_AAACCTGTCCCTGACT-1 5.677687 -8.299305 -0.4473130 0.8434805 2.3619541
L1_AAACCTGTCCTTCAAT-1 -2.391914 -3.886521 3.6777785 0.6684270 -3.4582755
harmony_6 harmony_7 harmony_8 harmony_9 harmony_10
L1_AAACCTGAGGGCTTCC-1 -1.3518703 0.5779017 0.82357261 0.05147958 -2.99820359
L1_AAACCTGGTGCAGGTA-1 -2.7738142 0.4930619 -0.94718130 -1.98288894 -0.29306582
L1_AAACCTGGTTAAAGTG-1 -0.5944073 0.1148602 0.13390567 -0.73955121 0.15302131
L1_AAACCTGTCAGGTAAA-1 0.9763336 0.9952336 -0.02484311 0.75790183 0.85443947
L1_AAACCTGTCCCTGACT-1 0.9771370 1.3723480 1.42741234 3.26999473 -2.04305499
L1_AAACCTGTCCTTCAAT-1 -3.4076926 1.2014147 -0.78175609 -0.96658940 0.03120191
harmony_11 harmony_12 harmony_13 harmony_14 harmony_15
L1_AAACCTGAGGGCTTCC-1 -0.8574223 1.1703863 2.00140154 -4.7869566 -4.0508491
L1_AAACCTGGTGCAGGTA-1 -0.7167047 0.5687011 0.43668419 -6.2715610 -0.9043011
L1_AAACCTGGTTAAAGTG-1 -1.8352070 -0.4407739 0.06923516 -1.6740228 -2.6545548
L1_AAACCTGTCAGGTAAA-1 1.0850785 -1.6901231 -0.84860407 0.9767801 1.0890113
L1_AAACCTGTCCCTGACT-1 -5.8168123 1.7142966 2.23989138 0.2608488 -1.6417914
L1_AAACCTGTCCTTCAAT-1 -1.3032645 0.6204413 -0.24156627 -3.0718909 -4.0629621
harmony_16 harmony_17 harmony_18 harmony_19 harmony_20
L1_AAACCTGAGGGCTTCC-1 -2.0161826 -0.7302338 -0.9766915 2.00252908 -1.5726583
L1_AAACCTGGTGCAGGTA-1 0.9712578 0.9052659 -1.4443073 0.03939291 0.5223135
L1_AAACCTGGTTAAAGTG-1 1.1804339 -0.2822484 -1.7806902 0.43388346 -0.5937077
L1_AAACCTGTCAGGTAAA-1 -1.3192626 0.6847828 -0.4169068 2.40137199 -1.0788731
L1_AAACCTGTCCCTGACT-1 -3.4420952 0.9651918 1.6350785 -1.14518312 0.4382207
L1_AAACCTGTCCTTCAAT-1 -2.6843688 -0.3501152 0.9737310 2.03611186 -1.5418044
harmony_21 harmony_22 harmony_23 harmony_24 harmony_25
L1_AAACCTGAGGGCTTCC-1 -0.1513534 -0.01486177 -1.2325019 -0.59192481 0.7382205
L1_AAACCTGGTGCAGGTA-1 5.5004035 -4.52311858 1.0133617 2.47982266 -2.2140198
L1_AAACCTGGTTAAAGTG-1 -3.4491206 1.85661047 -0.1131090 1.08392143 2.9182263
L1_AAACCTGTCAGGTAAA-1 1.0475900 -1.76838719 0.4233872 -0.09113039 -0.1209769
L1_AAACCTGTCCCTGACT-1 -4.7168502 4.05681623 -0.9904769 -1.28958874 0.5201374
L1_AAACCTGTCCTTCAAT-1 -1.4168634 -1.79046253 0.6019319 -2.02043078 0.5110636
harmony_26 harmony_27 harmony_28 harmony_29 harmony_30
L1_AAACCTGAGGGCTTCC-1 1.7431473 0.4636544 -0.22805744 0.3282561 -1.0339478
L1_AAACCTGGTGCAGGTA-1 -2.1371578 -0.1579083 3.02244773 2.3242378 -0.6730285
L1_AAACCTGGTTAAAGTG-1 1.3370499 -0.8496521 -0.09835919 -0.2197836 1.1656783
L1_AAACCTGTCAGGTAAA-1 1.2054119 0.8191877 -0.25683835 0.4581030 -1.7503764
L1_AAACCTGTCCCTGACT-1 0.7790093 -0.1472161 -1.13119986 -1.6958833 1.2246155
L1_AAACCTGTCCTTCAAT-1 -0.1394028 -0.2558101 -2.43375948 -1.1090352 -0.9156106
harmony_31 harmony_32 harmony_33 harmony_34
L1_AAACCTGAGGGCTTCC-1 -1.0026494 -0.93976448 0.65114289 1.024752280
L1_AAACCTGGTGCAGGTA-1 1.9242400 -0.87994608 -0.70505701 -1.862335377
L1_AAACCTGGTTAAAGTG-1 0.4933749 -0.06009371 0.69184997 -0.007219396
L1_AAACCTGTCAGGTAAA-1 -1.5163120 -1.11258095 -0.49555221 1.905602139
L1_AAACCTGTCCCTGACT-1 0.9148532 0.76473325 -0.63151771 0.150570066
L1_AAACCTGTCCTTCAAT-1 2.2634311 -2.41162645 0.08872674 -0.469676748
harmony_35 harmony_36 harmony_37 harmony_38
L1_AAACCTGAGGGCTTCC-1 2.2810455638 1.27146632 2.2679117 0.80224177
L1_AAACCTGGTGCAGGTA-1 -1.0506604846 -0.05483587 -1.4209155 0.06316705
L1_AAACCTGGTTAAAGTG-1 0.0008099394 0.01828348 0.3548007 -0.11709098
L1_AAACCTGTCAGGTAAA-1 0.9654023029 0.14970848 0.3320155 -1.58390727
L1_AAACCTGTCCCTGACT-1 1.9499683123 -0.12477692 1.3686675 0.11213882
L1_AAACCTGTCCTTCAAT-1 1.2740525885 -0.04239262 1.7282816 0.79007883
harmony_39 harmony_40 harmony_41 harmony_42 harmony_43
L1_AAACCTGAGGGCTTCC-1 -0.2229225 0.4169214 -0.283488641 0.7880048 1.0589024
L1_AAACCTGGTGCAGGTA-1 0.6131196 -0.8280645 0.656373269 0.2257520 0.5478604
L1_AAACCTGGTTAAAGTG-1 0.6134737 2.1779965 -1.127383495 -0.9094494 -1.5960250
L1_AAACCTGTCAGGTAAA-1 0.8413799 2.4499470 0.001922744 -0.1837016 -1.7771632
L1_AAACCTGTCCCTGACT-1 -2.4560710 0.2570944 -0.088538258 -0.1430335 0.6939479
L1_AAACCTGTCCTTCAAT-1 -0.7133223 2.3010322 -1.292397619 -1.2299753 -0.6381889
harmony_44 harmony_45 harmony_46 harmony_47 harmony_48
L1_AAACCTGAGGGCTTCC-1 -0.4649912 -0.5930716 0.5870768 -2.3091362 -1.6070719
L1_AAACCTGGTGCAGGTA-1 -0.2959463 -1.4389494 0.7371836 0.5603264 -0.3117767
L1_AAACCTGGTTAAAGTG-1 -1.4775035 -0.5181570 -0.4135170 1.0734510 0.5226686
L1_AAACCTGTCAGGTAAA-1 0.1554511 -0.2900890 2.9879476 1.1565070 0.3339082
L1_AAACCTGTCCCTGACT-1 -1.0883826 -0.4095025 0.3712944 -1.5448920 0.6724090
L1_AAACCTGTCCTTCAAT-1 -0.7324704 1.7899085 1.6007186 0.3975862 -1.2309389
harmony_49 harmony_50
L1_AAACCTGAGGGCTTCC-1 -0.02108137 -0.3507751
L1_AAACCTGGTGCAGGTA-1 1.11264625 1.7908485
L1_AAACCTGGTTAAAGTG-1 -2.05858525 -2.2694319
L1_AAACCTGTCAGGTAAA-1 -1.72673986 -3.4194218
L1_AAACCTGTCCCTGACT-1 0.07740715 0.3109355
L1_AAACCTGTCCTTCAAT-1 -1.03514366 1.2647421
# Fix the RNG state before the embedding step
set.seed(123)
# UMAP computed from the first 16 Harmony dimensions.
# NOTE(review): no `reduction.name` is given, so this presumably replaces the
# "umap" reduction computed earlier from the uncorrected PCA - confirm that
# this is intended.
All_samples_Merged <- RunUMAP(
  All_samples_Merged,
  reduction = "harmony",
  dims = 1:16
)
15:58:51 UMAP embedding parameters a = 0.9922 b = 1.112
15:58:51 Read 49386 rows and found 16 numeric columns
15:58:51 Using Annoy for neighbor search, n_neighbors = 30
15:58:51 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:58:56 Writing NN index file to temp file /tmp/RtmpCA1OKc/file2fd1d16e9df6a
15:58:56 Searching Annoy index using 1 thread, search_k = 3000
15:59:16 Annoy recall = 100%
15:59:18 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
15:59:23 Initializing from normalized Laplacian + noise (using RSpectra)
15:59:25 Commencing optimization for 200 epochs, with 2096862 positive edges
16:00:27 Optimization finished
# Fix the RNG state before the clustering steps
set.seed(123)
# Neighbour graph built on the first 16 Harmony dimensions (needed for clustering).
# NOTE(review): no `graph.name` is given, so the graphs built earlier from the
# PCA are presumably overwritten - confirm that this is intended.
All_samples_Merged <- FindNeighbors(
  All_samples_Merged,
  reduction = "harmony",
  dims = 1:16
)
Computing nearest neighbor graph
Computing SNN
# Cluster the Harmony-based graph (adjust resolution as needed).
# The result is stored under an explicit column name: with the default naming
# it would be written to `SCT_snn_res.0.5` and silently replace the clustering
# computed at that resolution before integration. `seurat_clusters` is still
# updated to this clustering.
All_samples_Merged <- FindClusters(
  All_samples_Merged,
  resolution = 0.5,
  cluster.name = "harmony_snn_res.0.5"
)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 49386
Number of edges: 1325187
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8485
Number of communities: 17
Elapsed time: 16 seconds
# UMAP grouped by cell line
DimPlot(
  All_samples_Merged,
  reduction = "umap",
  group.by = "cell_line",
  label = TRUE,
  pt.size = 0.5
) +
  ggtitle("UMAP of Harmony-Integrated Data")

# Same grouping by cell line, under a more explicit title
DimPlot(
  All_samples_Merged,
  reduction = "umap",
  group.by = "cell_line",
  label = TRUE,
  pt.size = 0.5
) +
  ggtitle("UMAP - Colored by Cell Line (After Harmony Integration)")

# UMAP grouped by the current `seurat_clusters` assignment
DimPlot(
  All_samples_Merged,
  reduction = "umap",
  group.by = "seurat_clusters",
  label = TRUE,
  pt.size = 0.5
) +
  ggtitle("UMAP - Clustered Data (After Harmony Integration)")

# UMAP grouped by the level-2 predicted cell type
DimPlot(
  All_samples_Merged,
  reduction = "umap",
  group.by = "predicted.celltype.l2",
  label = TRUE,
  pt.size = 0.5
) +
  ggtitle("UMAP - Cell Types After Harmony Integration")
#save(All_samples_Merged, file = "../../../0-IMP-OBJECTS/All_Samples_Merged_with_10x_Azitmuth_Annotated_SCT_HPC_without_harmony_integration.robj")