combined_merged <- readRDS("../../../MyData_Gaydosik_Data_Integration/Poglio_Gaydosik_combined_merged_SCT_Normalized.rds")
# Remove the percent.mito column
combined_merged$percent.mito <- NULL
Warning: Cannot find cell-level meta data named percent.mito
# Set identity classes to an existing column in meta data
Idents(object = combined_merged) <- "batch"
# Add percent ribosomal and mitochondrial content
combined_merged[["percent.rb"]] <- PercentageFeatureSet(combined_merged, pattern = "^RP[SL]")
combined_merged$percent.rb <- replace(as.numeric(combined_merged$percent.rb), is.na(combined_merged$percent.rb), 0)
combined_merged[["percent.mt"]] <- PercentageFeatureSet(combined_merged, pattern = "^MT-")
setwd("~/2025_NewHarmony_Integrated_Files/Year3_Analysis/Biomarkers_Validation_with_Public_Data/Gaydosik_Paper_Data_3_SS_Patients/Malignant_HC_Together_Analysis_for_Validation/Azimuth_Annotation")
combined_merged$percent.mt <- replace(as.numeric(combined_merged$percent.mt), is.na(combined_merged$percent.mt), 0)
# ----------------------------
# Filter high `nCount_RNA` cells
# ----------------------------
# Define an upper threshold (e.g., top 0.5% or absolute cutoff)
ncount_thresh <- quantile(combined_merged$nCount_RNA, 0.995) # top 0.5% outliers
message("Removing cells with nCount_RNA > ", round(ncount_thresh))
Removing cells with nCount_RNA > 32789
# Subset the object to remove both types of outliers
combined_merged <- subset(combined_merged,
subset = nCount_RNA < ncount_thresh & percent.mt < 10)
# ----------------------------
# QC Plots (after filtering)
# ----------------------------
VlnPlot(combined_merged, features = c("nFeature_RNA",
"nCount_RNA",
"percent.mt",
"percent.rb"),
ncol = 4, pt.size = 0.1) & theme(plot.title = element_text(size=10))
Warning: The `slot` argument of `FetchData()` is deprecated as of SeuratObject 5.0.0.
Please use the `layer` argument instead.Warning: `PackageCheck()` was deprecated in SeuratObject 5.0.0.
Please use `rlang::check_installed()` instead.
FeatureScatter(combined_merged, feature1 = "percent.mt", feature2 = "percent.rb")
FeatureScatter(combined_merged, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") +
geom_smooth(method = 'lm')
##FeatureScatter is typically used to visualize feature-feature relationships ##for anything calculated by the object, ##i.e. columns in object metadata, PC scores etc.
FeatureScatter(combined_merged,
feature1 = "nCount_RNA",
feature2 = "percent.mt")+
geom_smooth(method = 'lm')
FeatureScatter(combined_merged,
feature1 = "nCount_RNA",
feature2 = "nFeature_RNA")+
geom_smooth(method = 'lm')
# # Apply SCTransform
# combined_merged <- SCTransform(combined_merged,
# vars.to.regress = c("percent.rb","percent.mt","CC.Difference", "nCount_RNA"),
# assay = "RNA",
# do.scale=TRUE,
# do.center=TRUE,
# verbose = TRUE)
# Variables_genes <- combined_merged@assays$SCT@var.features
#
# # Exclude genes starting with "HLA-" AND "Xist" AND "TRBV, TRAV"
# Variables_genes_after_exclusion <- Variables_genes[!grepl("^HLA-|^XIST|^TRBV|^TRAV", Variables_genes)]
#
# # Set the seed for clustering steps
# set.seed(123)
#
# # These are now standard steps in the Seurat workflow for visualization and clustering
# combined_merged <- RunPCA(combined_merged,
# features = Variables_genes_after_exclusion,
# do.print = TRUE,
# pcs.print = 1:5,
# genes.print = 15,
# npcs = 50)
# determine dimensionality of the data
ElbowPlot(combined_merged, ndims = 50)
library(ggplot2)
library(RColorBrewer)
# Assuming you have 10 different cell lines, generating a color palette with 10 colors
n_batches <- length(unique(data$batch))
batch_colors <- setNames(colorRampPalette(brewer.pal(8, "Set3"))(n_batches), unique(data$batch))
# Assuming combined_merged$batch is a factor or character vector containing cell line names
data <- as.data.frame(table(combined_merged$batch))
colnames(data) <- c("batch", "nUMI") # Change column name to nUMI
ncells <- ggplot(data, aes(x = batch, y = nUMI, fill = batch)) +
geom_col() +
theme_classic() +
geom_text(aes(label = nUMI),
position = position_dodge(width = 0.9),
vjust = -0.25) +
scale_fill_manual(values = batch_colors) +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)) + # Adjust the title position
ggtitle("Filtered cells per sample") +
xlab("batch") + # Adjust x-axis label
ylab("Frequency") # Adjust y-axis label
print(ncells)
# TEST-1
# given that the output of RunPCA is "pca"
# replace "so" by the name of your seurat object
pct <- combined_merged[["pca"]]@stdev / sum(combined_merged[["pca"]]@stdev) * 100
cumu <- cumsum(pct) # Calculate cumulative percents for each PC
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which((pct[-length(pct)] - pct[-1]) > 0.1), decreasing = T)[1] + 1
# last point where change of % of variation is more than 0.1%. -> co2
co2
[1] 20
# TEST-2
# get significant PCs
stdv <- combined_merged[["pca"]]@stdev
sum.stdv <- sum(combined_merged[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
cumulative <- cumsum(percent.stdv)
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co2 <- sort(which((percent.stdv[1:length(percent.stdv) - 1] -
percent.stdv[2:length(percent.stdv)]) > 0.1),
decreasing = T)[1] + 1
min.pc <- min(co1, co2)
min.pc
[1] 20
# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv,
cumu = cumulative,
rank = 1:length(percent.stdv))
# Elbow plot to visualize
ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) +
geom_text() +
geom_vline(xintercept = 90, color = "grey") +
geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
theme_bw()
NA
NA
NA
# # Set the seed for clustering steps
# set.seed(123)
#
# combined_merged <- FindNeighbors(combined_merged,
# dims = 1:min.pc,
# verbose = FALSE)
#
# # understanding resolution
# combined_merged <- FindClusters(combined_merged,
# resolution = c(0.3, 0.4, 0.5, 0.6, 0.7,0.8, 0.9, 1, 1.2,1.5))
#
# # Set the seed for clustering steps
# set.seed(123)
#
# # non-linear dimensionality reduction --------------
# combined_merged <- RunUMAP(combined_merged,
# dims = 1:min.pc,
# verbose = FALSE)
#
#
# # Define resolution values to plot
# resolutions <- c(0.3, 0.4, 0.5, 0.6, 0.7,0.8, 0.9, 1, 1.2,1.5)
#
# # Loop through and generate DimPlots
# for (res in resolutions) {
# res_col <- paste0("SCT_snn_res.", res)
#
# p <- DimPlot(combined_merged,
# group.by = res_col,
# reduction = "umap",
# label.size = 3,
# repel = TRUE,
# label = TRUE,
# label.box = TRUE) +
# ggtitle(paste("Clustering resolution:", res))
#
# print(p)
# }
#
# Set identity classes to an existing column in meta data
Idents(object = combined_merged) <- "SCT_snn_res.0.4"
cluster_table <- table(Idents(combined_merged))
barplot(cluster_table, main = "Number of Cells in Each Cluster",
xlab = "Cluster",
ylab = "Number of Cells",
col = rainbow(length(cluster_table)))
print(cluster_table)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
9569 7215 6432 6190 4423 5740 5634 5664 5541 5410 5174 2655 1576 1937 213 1386 1429 1179 813 801 798
21 22 23 24 25 26
527 423 358 338 159 80
DimPlot(combined_merged, group.by = "batch",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
library(clustree)
Loading required package: ggraph
Attaching package: 'ggraph'
The following object is masked from 'package:sp':
geometry
clustree(combined_merged, prefix = "SCT_snn_res.")
library(Seurat)
InstallData("pbmcref")
Warning: The following packages are already installed and will not be reinstalled: pbmcref
# The RunAzimuth function can take a Seurat object as input
combined_merged <- RunAzimuth(combined_merged, reference = "pbmcref")
Warning: Overwriting miscellanous data for modelWarning: Adding a dimensional reduction (refUMAP) without the associated assay being presentWarning: Adding a dimensional reduction (refUMAP) without the associated assay being presentdetected inputs from HUMAN with id type Gene.name
reference rownames detected HUMAN with id type Gene.name
Normalizing query using reference SCT model
Warning: 113 features of the features specified were not present in both the reference query assays.
Continuing with remaining 4887 features.Projecting cell embeddings
Error: Cannot add new cells with [[<-
DimPlot(combined_merged, group.by = "predicted.celltype.l1",
reduction = "umap",
label.size = 3,
repel = T,
label = T, label.box = T)
Warning: The following requested variables were not found: predicted.celltype.l1Error in `[.data.frame`(data, , group) : undefined columns selected
saveRDS(combined_merged, file = "Poglio_Gaydosik_combined_merged_SCT_Normalized.rds")
# Load required libraries
library(Seurat)
library(harmony)
library(ggplot2)
# Run Harmony, adjusting for batch effect using "cell_line" or another grouping variable
combined_merged <- RunHarmony(
combined_merged,
group.by.vars = c("batch"), # Replace with the metadata column specifying batch or cell line
assay.use="SCT")
Transposing data matrix
Initializing state using k-means centroids initialization
Harmony 1/10
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Harmony 2/10
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Harmony 3/10
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Harmony converged after 3 iterations
# Check results in harmony embeddings
harmony_embeddings <- Embeddings(combined_merged, reduction = "harmony")
head(harmony_embeddings)
harmony_1 harmony_2 harmony_3 harmony_4 harmony_5 harmony_6 harmony_7
L1_AAACCTGAGGGCTTCC-1 -18.4855027 7.9423204 -0.2196979 2.4854855 3.0805502 -2.13201971 -1.5175328
L1_AAACCTGGTGCAGGTA-1 -6.5494297 -4.1619010 0.2539774 0.3868676 -0.2058094 0.86031329 0.6284560
L1_AAACCTGGTTAAAGTG-1 4.5029146 -2.5820435 -1.1373015 0.1770274 -2.1259750 4.60306410 -0.6284321
L1_AAACCTGTCAGGTAAA-1 0.5745898 -0.8535429 0.8603061 1.5044380 4.0526170 1.32483585 -1.7043553
L1_AAACCTGTCCCTGACT-1 -13.2222987 7.3356257 1.8729908 0.2086466 0.4211220 0.03685936 -1.0460933
L1_AAACCTGTCCTTCAAT-1 -0.8538374 -4.8084348 -1.5473780 -1.1453762 -2.9576401 1.61335360 0.3296758
harmony_8 harmony_9 harmony_10 harmony_11 harmony_12 harmony_13 harmony_14
L1_AAACCTGAGGGCTTCC-1 2.8195613 -3.1815283 0.16958309 -5.4855500 4.9764302 0.1773200 0.8632915
L1_AAACCTGGTGCAGGTA-1 3.1016680 3.2600778 -0.52434897 -4.1796631 0.6155530 0.2923108 1.1184826
L1_AAACCTGGTTAAAGTG-1 1.9698514 -1.4398020 0.50992434 -2.2941862 -0.7746350 1.0098298 -1.0810271
L1_AAACCTGTCAGGTAAA-1 -0.3045706 0.3694501 -0.65804743 -0.1578142 -1.2242120 2.2096673 0.2381271
L1_AAACCTGTCCCTGACT-1 2.1340628 -2.0582358 -0.23791341 -2.2020179 3.1349577 -1.3031110 4.0706958
L1_AAACCTGTCCTTCAAT-1 1.5981304 0.1617858 0.01682314 -1.3374615 0.7898118 0.6148349 0.1959089
harmony_15 harmony_16 harmony_17 harmony_18 harmony_19 harmony_20 harmony_21
L1_AAACCTGAGGGCTTCC-1 1.77877175 0.648277318 -1.8146050 -0.60387511 0.7448553 1.0578965 1.297213
L1_AAACCTGGTGCAGGTA-1 -1.25676408 -0.032001930 -1.7105728 0.62126453 2.0266850 1.4024440 -2.900196
L1_AAACCTGGTTAAAGTG-1 -0.03085833 -0.005814651 -0.6493010 -0.72298562 1.9222377 -2.6967548 -0.557706
L1_AAACCTGTCAGGTAAA-1 -0.46478792 -1.207716626 0.5349474 -1.18396607 -0.7381161 -1.1951483 -4.461107
L1_AAACCTGTCCCTGACT-1 1.44715193 -0.066221590 -0.8311151 -1.32487017 0.1419474 1.5816799 0.757584
L1_AAACCTGTCCTTCAAT-1 0.16629524 -0.526111927 -1.0628915 -0.07495768 1.0285287 0.5241696 1.198793
harmony_22 harmony_23 harmony_24 harmony_25 harmony_26 harmony_27 harmony_28
L1_AAACCTGAGGGCTTCC-1 3.1438926 4.0525919 -1.8571393 -1.123777 -1.77661793 0.2440655 -1.6636347
L1_AAACCTGGTGCAGGTA-1 2.7535922 -1.7408459 1.4560267 -1.602397 -0.97356067 -0.1840563 0.7404742
L1_AAACCTGGTTAAAGTG-1 0.3655097 -1.9256351 -3.2008633 -2.588555 2.79950517 3.2863951 -1.3789454
L1_AAACCTGTCAGGTAAA-1 3.2063842 -1.7560083 0.5468997 -1.312635 0.01983461 1.0390326 0.2064140
L1_AAACCTGTCCCTGACT-1 2.1987280 3.4204367 -1.1399827 -1.367466 -1.13131567 1.4423038 -0.8344294
L1_AAACCTGTCCTTCAAT-1 1.3319477 0.3365448 0.5921872 -1.603506 1.51891869 0.6538837 -0.5850538
harmony_29 harmony_30 harmony_31 harmony_32 harmony_33 harmony_34 harmony_35
L1_AAACCTGAGGGCTTCC-1 -0.6985372 0.001992406 -0.03308453 -0.5344455 0.08165714 1.1561033 0.09751095
L1_AAACCTGGTGCAGGTA-1 -1.4326163 -1.421477162 0.42626754 1.1301252 -0.27722996 0.5523950 0.32210353
L1_AAACCTGGTTAAAGTG-1 1.2013954 0.019446755 1.14350897 -0.8957589 -0.13677914 0.7776574 -1.90726729
L1_AAACCTGTCAGGTAAA-1 0.4212270 -0.203393996 0.71283186 1.4282126 0.63228992 -1.8358861 0.92606916
L1_AAACCTGTCCCTGACT-1 -1.1708837 1.457756837 0.08031120 -0.8225988 0.61428860 -1.4715428 0.63390863
L1_AAACCTGTCCTTCAAT-1 -1.4805198 -0.030434591 -1.82280183 0.1143809 1.89139408 -0.4546119 1.13054387
harmony_36 harmony_37 harmony_38 harmony_39 harmony_40 harmony_41 harmony_42
L1_AAACCTGAGGGCTTCC-1 -0.52539264 -0.04873894 -0.5614033 0.8920342 -0.8063888 0.02255221 0.2444488
L1_AAACCTGGTGCAGGTA-1 -0.36683300 -1.59599078 0.1137256 -0.1385294 1.1308844 -0.56512949 1.2670811
L1_AAACCTGGTTAAAGTG-1 -1.45259164 0.47771741 0.2388712 0.7192189 0.4374085 0.15769393 0.5817462
L1_AAACCTGTCAGGTAAA-1 0.01180534 -0.61542923 0.7325384 -0.0920718 2.3385877 -0.62949538 -1.3082965
L1_AAACCTGTCCCTGACT-1 0.70117509 0.82009779 0.7759824 0.2438408 -1.2703849 2.86204005 -0.4627982
L1_AAACCTGTCCTTCAAT-1 -0.90789373 -0.27960555 0.1415306 0.6175270 1.0569101 -0.12742201 -0.6355494
harmony_43 harmony_44 harmony_45 harmony_46 harmony_47 harmony_48 harmony_49
L1_AAACCTGAGGGCTTCC-1 -0.88118369 0.7919337 -1.26483643 -0.25145293 0.2382496 0.4712432 -1.6265200
L1_AAACCTGGTGCAGGTA-1 0.59983144 2.8364486 -1.28550254 -1.37360333 1.2220225 -0.1983266 -1.7725065
L1_AAACCTGGTTAAAGTG-1 0.61253631 -0.9935459 -0.04096519 -0.52000917 -0.7439375 -0.7159783 -2.7579180
L1_AAACCTGTCAGGTAAA-1 -0.03403918 -1.0374222 -2.25264836 -0.86893246 2.6223660 -0.5912721 2.5533740
L1_AAACCTGTCCCTGACT-1 -1.23578994 -1.5286682 -1.49695807 0.05879675 -0.1396169 2.1965935 0.7691668
L1_AAACCTGTCCTTCAAT-1 -0.12479116 0.3548016 -0.45706530 -1.13590570 0.4446526 -1.9508394 -1.0077698
harmony_50
L1_AAACCTGAGGGCTTCC-1 3.7587366
L1_AAACCTGGTGCAGGTA-1 0.6304744
L1_AAACCTGGTTAAAGTG-1 0.3955415
L1_AAACCTGTCAGGTAAA-1 -0.4746698
L1_AAACCTGTCCCTGACT-1 2.6825509
L1_AAACCTGTCCTTCAAT-1 2.7133421
# Set the seed for clustering steps
set.seed(123)
# Run UMAP on Harmony embeddings
combined_merged <- RunUMAP(combined_merged, reduction = "harmony", dims = 1:16)
10:56:18 UMAP embedding parameters a = 0.9922 b = 1.112
10:56:18 Read 81664 rows and found 16 numeric columns
10:56:18 Using Annoy for neighbor search, n_neighbors = 30
10:56:18 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
10:56:24 Writing NN index file to temp file /tmp/Rtmp2IR4Yl/file8415b17c6fbc3
10:56:24 Searching Annoy index using 1 thread, search_k = 3000
10:56:48 Annoy recall = 100%
10:56:49 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
10:56:51 Initializing from normalized Laplacian + noise (using RSpectra)
10:56:54 Commencing optimization for 200 epochs, with 3520450 positive edges
10:56:54 Using rng type: pcg
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
10:57:21 Optimization finished
# Set the seed for clustering steps
set.seed(123)
# Optionally, find neighbors and clusters (if you plan to do clustering analysis)
combined_merged <- FindNeighbors(combined_merged, reduction = "harmony", dims = 1:16)
Computing nearest neighbor graph
Computing SNN
combined_merged <- FindClusters(combined_merged, reduction = "harmony", resolution = 0.5) # Adjust resolution as needed
Warning: The following arguments are not used: reductionWarning: The following arguments are not used: reduction
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 81664
Number of edges: 2391723
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9061
Number of communities: 21
Elapsed time: 25 seconds
# Visualize UMAP
DimPlot(combined_merged, reduction = "umap", group.by = "batch", label = TRUE, pt.size = 0.5) +
ggtitle("UMAP of Harmony-Integrated Data")
# Visualize UMAP with batch/cell line information
DimPlot(combined_merged, reduction = "umap", group.by = "batch", label = TRUE, pt.size = 0.5) +
ggtitle("UMAP - Colored by Cell Line (After Harmony Integration)")
# Visualize UMAP with clusters
DimPlot(combined_merged, reduction = "umap", group.by = "seurat_clusters", label = TRUE, pt.size = 0.5) +
ggtitle("UMAP - Clustered Data (After Harmony Integration)")
# # Visualize specific cell types or other metadata
# DimPlot(combined_merged, reduction = "umap", group.by = "predicted.celltype.l2", label = TRUE, pt.size = 0.5) +
# ggtitle("UMAP - Cell Types After Harmony Integration")
saveRDS(combined_merged, file = "Poglio_Gaydosik_combined_merged_Harmonized.rds")