1. Load libraries
2. Load Seurat Object
# Restore the merged Seurat object (cell lines plus a control, saved after
# filtering). load() recreates objects under the names they were saved with;
# the object used throughout the rest of this notebook is All_samples_Merged.
load(
  file = "0-imp_Robj/All_Samples_Merged_with_10x_Azitmuth_Annotated_SCT_HPC_without_harmony_integration_removed_nonCD4cells_from_control_and_Bcells_from_L4_ILC_NK_CD14Mono_no_nCount_nFeatureRegress_ready_for_Harmony_final.robj"
)
# Show the object summary (assays, layers, reductions)
print(All_samples_Merged)
An object of class Seurat
62926 features across 49360 samples within 6 assays
Active assay: SCT (26174 features, 3000 variable features)
3 layers present: counts, data, scale.data
5 other assays present: RNA, ADT, prediction.score.celltype.l1, prediction.score.celltype.l2, prediction.score.celltype.l3
4 dimensional reductions calculated: integrated_dr, ref.umap, pca, umap
# Pre-integration overview: the existing "umap" embedding is drawn once per
# metadata column so the uncorrected structure can be compared with the
# post-Harmony plots further down.
#
# grouping: name of the metadata column used to colour and label the cells.
# title:    plot title.
# Returns the labelled DimPlot (auto-printed when called at top level).
plot_before_harmony <- function(grouping, title) {
  DimPlot(
    All_samples_Merged,
    reduction = "umap",
    group.by = grouping,
    label = TRUE,
    label.box = TRUE
  ) +
    ggtitle(title)
}

plot_before_harmony("cell_line", "Before Harmony - By Cell Line")

plot_before_harmony("SCT_snn_res.0.5", "Before Harmony - By Clusters")

plot_before_harmony("predicted.celltype.l1", "Before Harmony - By Annotation.l1")

plot_before_harmony("predicted.celltype.l2", "Before Harmony - By Annotation.l2")

plot_before_harmony("predicted.celltype.l3", "Before Harmony - By Annotation.l3")

# Cross-tabulate the predicted level-2 cell-type labels (rows) against the
# resolution-0.5 clusters (columns). deparse.level = 0 keeps the dimension
# headers blank, as they are when the columns are passed inline.
annotation_l2 <- All_samples_Merged$predicted.celltype.l2
clusters_res_0_5 <- All_samples_Merged$SCT_snn_res.0.5
table(annotation_l2, clusters_res_0_5, deparse.level = 0)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
B intermediate 0 0 0 0 0 0 2 0 0 0 0 0 0 5 0 0
B memory 8 0 0 0 79 1 31 0 1 117 2 0 1 5 7 0
CD4 CTL 0 0 0 0 0 12 0 0 0 0 0 0 0 0 0 1
CD4 Naive 0 0 0 0 0 518 0 1479 0 0 0 37 0 7 0 1
CD4 Proliferating 5359 2852 2461 5335 3882 0 3203 5 2821 1289 1473 1 51 119 160 0
CD4 TCM 838 268 3316 137 524 4225 110 1837 22 459 45 417 386 149 42 55
CD4 TEM 0 0 1 0 0 61 0 21 0 0 0 1 0 0 0 0
CD8 Proliferating 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0
CD8 TCM 0 16 1 0 0 0 0 0 0 0 0 0 0 0 0 0
CD8 TEM 0 8 1 0 3 0 2 0 0 1 0 0 0 0 0 0
cDC1 0 0 0 0 4 0 2 0 0 0 0 0 0 1 1 0
cDC2 0 0 0 0 3 0 10 0 0 35 1 0 2 0 0 2
dnT 0 1 0 0 1 0 2 0 0 3 0 2 1 5 0 0
HSPC 54 0 0 1 202 0 666 0 482 7 369 0 0 14 10 0
NK Proliferating 4 2785 39 23 227 0 10 0 10 24 1 0 0 28 10 0
Treg 0 0 1 1 2 0 0 0 0 0 0 1 0 40 0 0
3. Perform Harmony Integration
# Run Harmony with each cell line treated as a batch. theta is Harmony's
# diversity penalty; 0.5 is below the package default of 2, i.e. a
# comparatively gentle correction.
All_samples_Merged <- RunHarmony(
  All_samples_Merged,
  assay.use = "SCT",
  group.by.vars = "cell_line",
  theta = 0.5
)
Transposing data matrix
Initializing state using k-means centroids initialization
Harmony 1/10
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Harmony 2/10
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Harmony converged after 2 iterations
# Build the kNN/SNN graph on the Harmony-corrected embedding.
# FindNeighbors() defaults to reduction = "pca", so the reduction has to be
# named explicitly; without it the graph (and every cluster derived from it)
# is computed on the uncorrected PCs while the UMAP below uses "harmony".
All_samples_Merged <- FindNeighbors(
  All_samples_Merged,
  reduction = "harmony",
  dims = 1:16 # same 16 dimensions that RunUMAP takes from "harmony"
)
Computing nearest neighbor graph
Computing SNN
# Partition the SNN graph stored by the preceding FindNeighbors() call.
# The clusters therefore reflect whichever reduction that graph was built on.
# The cluster labels are tabulated later via the SCT_snn_res.0.2 column.
All_samples_Merged <- FindClusters(
  object = All_samples_Merged,
  resolution = 0.2
)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 49360
Number of edges: 1629681
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9766
Number of communities: 12
Elapsed time: 19 seconds
# Compute a UMAP from the first 16 dimensions of the "harmony" reduction and
# store it as "umap.harmony", a separate name from the existing "umap"
# reduction used in the pre-integration plots.
All_samples_Merged <- RunUMAP(
  object = All_samples_Merged,
  dims = 1:16,
  reduction = "harmony",
  reduction.name = "umap.harmony"
)
Avis : The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session11:34:47 UMAP embedding parameters a = 0.9922 b = 1.112
11:34:47 Read 49360 rows and found 16 numeric columns
11:34:47 Using Annoy for neighbor search, n_neighbors = 30
11:34:47 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
11:34:53 Writing NN index file to temp file /tmp/RtmpBM0KyY/file16cb435691e8ba
11:34:53 Searching Annoy index using 1 thread, search_k = 3000
11:35:15 Annoy recall = 100%
11:35:16 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
11:35:20 Initializing from normalized Laplacian + noise (using RSpectra)
11:35:23 Commencing optimization for 200 epochs, with 2089924 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
11:35:52 Optimization finished
4. Visualize Harmony Integrated Data
# Post-integration overview on the "umap.harmony" embedding.
#
# grouping: name of the metadata column used to colour and label the cells.
# title:    plot title.
# Returns the labelled DimPlot.
harmony_umap_by <- function(grouping, title) {
  DimPlot(
    All_samples_Merged,
    reduction = "umap.harmony",
    group.by = grouping,
    label = TRUE,
    label.box = TRUE
  ) +
    ggtitle(title)
}

# Coloured by cell line, by cluster, and by predicted level-2 annotation
p3 <- harmony_umap_by("cell_line", "After Harmony - By Cell Line")
p4 <- harmony_umap_by("seurat_clusters", "After Harmony - By Clusters")
p5 <- harmony_umap_by("predicted.celltype.l2", "After Harmony - Cell Type Annotations")

# Cell-line and cluster views combined into one figure
p3 + p4

# Annotation view on its own
print(p5)

# Same three post-Harmony views, this time with repel = TRUE so the labels
# are pushed apart. TRUE is spelled out: T is an ordinary variable that can
# be reassigned, whereas TRUE is a reserved constant.
DimPlot(
  All_samples_Merged,
  reduction = "umap.harmony",
  group.by = "cell_line",
  label = TRUE,
  label.box = TRUE,
  repel = TRUE
) +
  ggtitle("Harmony Integration - By Cell Line")

DimPlot(
  All_samples_Merged,
  reduction = "umap.harmony",
  group.by = "seurat_clusters",
  label = TRUE,
  label.box = TRUE,
  repel = TRUE
) +
  ggtitle("Harmony Integration - By Clusters")

DimPlot(
  All_samples_Merged,
  reduction = "umap.harmony",
  group.by = "predicted.celltype.l2",
  label = TRUE,
  label.box = TRUE,
  repel = TRUE
) +
  ggtitle("Harmony Integration - Annotations")

# Count cells per predicted level-2 label (rows) and cell line (columns).
# deparse.level = 0 keeps the dimension headers blank, as they are when the
# columns are passed inline.
annotation_l2 <- All_samples_Merged$predicted.celltype.l2
cell_line_id <- All_samples_Merged$cell_line
table(annotation_l2, cell_line_id, deparse.level = 0)
L1 L2 L3 L4 L5 L6 L7 PBMC PBMC_10x
B intermediate 0 0 2 1 2 2 0 0 0
B memory 0 0 11 1 38 82 120 0 0
CD4 CTL 0 0 0 0 0 0 0 12 1
CD4 Naive 0 0 0 7 0 0 0 523 1512
CD4 Proliferating 2461 2852 5452 5391 4732 4002 4115 0 6
CD4 TCM 3320 270 887 562 178 557 517 4576 1963
CD4 TEM 1 0 0 0 0 0 0 60 23
CD8 Proliferating 0 0 0 0 0 1 1 0 0
CD8 TCM 1 16 0 0 0 0 0 0 0
CD8 TEM 1 8 0 0 2 3 1 0 0
cDC1 0 0 0 0 2 6 0 0 0
cDC2 0 0 0 4 11 3 35 0 0
dnT 2 3 0 1 2 5 2 0 0
HSPC 0 0 60 7 1035 213 490 0 0
NK Proliferating 38 2785 6 24 11 259 38 0 0
Treg 1 1 9 9 4 15 6 0 0
# Count cells per predicted level-2 label (rows) and resolution-0.2 cluster
# (columns). deparse.level = 0 keeps the dimension headers blank, as they are
# when the columns are passed inline.
annotation_l2 <- All_samples_Merged$predicted.celltype.l2
clusters_res_0_2 <- All_samples_Merged$SCT_snn_res.0.2
table(annotation_l2, clusters_res_0_2, deparse.level = 0)
0 1 2 3 4 5 6 7 8 9 10 11
B intermediate 0 0 0 0 0 2 3 2 0 0 0 0
B memory 9 1 0 0 118 87 4 30 0 3 0 0
CD4 CTL 0 0 0 0 0 0 12 0 0 0 0 1
CD4 Naive 0 0 0 0 0 0 532 0 1479 0 30 1
CD4 Proliferating 5453 5386 2852 2461 4143 4064 3 3216 5 1428 0 0
CD4 TCM 879 523 268 3319 483 615 4607 109 1837 44 91 55
CD4 TEM 0 0 0 1 0 0 62 0 21 0 0 0
CD8 Proliferating 0 0 0 0 1 1 0 0 0 0 0 0
CD8 TCM 0 0 16 1 0 0 0 0 0 0 0 0
CD8 TEM 0 0 8 1 1 3 0 2 0 0 0 0
cDC1 0 0 0 0 0 6 0 2 0 0 0 0
cDC2 0 2 0 0 36 3 0 10 0 0 0 2
dnT 0 1 1 2 3 5 1 2 0 0 0 0
HSPC 57 1 0 0 490 215 8 672 0 362 0 0
NK Proliferating 6 23 2785 39 34 263 0 10 0 1 0 0
Treg 15 1 0 1 0 25 3 0 0 0 0 0
Visualize Harmony Integrated Data distribution
# Distribution of predicted level-2 labels (rows) across cell lines (columns).
# deparse.level = 0 keeps the dimension headers blank, as they are when the
# columns are passed inline.
l2_labels <- All_samples_Merged$predicted.celltype.l2
sample_of_origin <- All_samples_Merged$cell_line
table(l2_labels, sample_of_origin, deparse.level = 0)
L1 L2 L3 L4 L5 L6 L7 PBMC PBMC_10x
B intermediate 0 0 2 1 2 2 0 0 0
B memory 0 0 11 1 38 82 120 0 0
CD4 CTL 0 0 0 0 0 0 0 12 1
CD4 Naive 0 0 0 7 0 0 0 523 1512
CD4 Proliferating 2461 2852 5452 5391 4732 4002 4115 0 6
CD4 TCM 3320 270 887 562 178 557 517 4576 1963
CD4 TEM 1 0 0 0 0 0 0 60 23
CD8 Proliferating 0 0 0 0 0 1 1 0 0
CD8 TCM 1 16 0 0 0 0 0 0 0
CD8 TEM 1 8 0 0 2 3 1 0 0
cDC1 0 0 0 0 2 6 0 0 0
cDC2 0 0 0 4 11 3 35 0 0
dnT 2 3 0 1 2 5 2 0 0
HSPC 0 0 60 7 1035 213 490 0 0
NK Proliferating 38 2785 6 24 11 259 38 0 0
Treg 1 1 9 9 4 15 6 0 0
# Distribution of predicted level-2 labels (rows) across the resolution-0.2
# clusters (columns). deparse.level = 0 keeps the dimension headers blank, as
# they are when the columns are passed inline.
l2_labels <- All_samples_Merged$predicted.celltype.l2
cluster_id <- All_samples_Merged$SCT_snn_res.0.2
table(l2_labels, cluster_id, deparse.level = 0)
0 1 2 3 4 5 6 7 8 9 10 11
B intermediate 0 0 0 0 0 2 3 2 0 0 0 0
B memory 9 1 0 0 118 87 4 30 0 3 0 0
CD4 CTL 0 0 0 0 0 0 12 0 0 0 0 1
CD4 Naive 0 0 0 0 0 0 532 0 1479 0 30 1
CD4 Proliferating 5453 5386 2852 2461 4143 4064 3 3216 5 1428 0 0
CD4 TCM 879 523 268 3319 483 615 4607 109 1837 44 91 55
CD4 TEM 0 0 0 1 0 0 62 0 21 0 0 0
CD8 Proliferating 0 0 0 0 1 1 0 0 0 0 0 0
CD8 TCM 0 0 16 1 0 0 0 0 0 0 0 0
CD8 TEM 0 0 8 1 1 3 0 2 0 0 0 0
cDC1 0 0 0 0 0 6 0 2 0 0 0 0
cDC2 0 2 0 0 36 3 0 10 0 0 0 2
dnT 0 1 1 2 3 5 1 2 0 0 0 0
HSPC 57 1 0 0 490 215 8 672 0 362 0 0
NK Proliferating 6 23 2785 39 34 263 0 10 0 1 0 0
Treg 15 1 0 1 0 25 3 0 0 0 0 0
# Cell line (rows) versus resolution-0.2 cluster (columns): shows how far each
# cluster is made up of a single sample. deparse.level = 0 keeps the dimension
# headers blank, as they are when the columns are passed inline.
sample_of_origin <- All_samples_Merged$cell_line
cluster_id <- All_samples_Merged$SCT_snn_res.0.2
table(sample_of_origin, cluster_id, deparse.level = 0)
0 1 2 3 4 5 6 7 8 9 10 11
L1 0 0 0 5815 0 5 5 0 0 0 0 0
L2 0 0 5930 0 0 2 3 0 0 0 0 0
L3 6406 2 0 2 0 6 11 0 0 0 0 0
L4 13 5936 0 1 0 0 55 0 0 0 0 2
L5 0 0 0 1 105 15 3 4055 0 1838 0 0
L6 0 0 0 5 7 5135 1 0 0 0 0 0
L7 0 0 0 1 5196 125 3 0 0 0 0 0
PBMC 0 0 0 0 0 1 5132 0 0 0 3 35
PBMC_10x 0 0 0 0 1 0 22 0 3342 0 118 22
5. Marker Gene Visualization
# Marker panel covering the immune cell types of interest.
# CD45RO and CD45RA are not found in the default assay; Seurat resolves them
# from the "ADT" assay and emits a warning saying so when this panel is drawn.
myfeatures1 <- c(
  "CD19", "CD79A", "MS4A1",   # B cells
  "CD14", "LYZ", "FCGR3A",    # Monocytes
  "CSF1R", "CD68",            # Macrophages
  "NKG7", "GNLY", "KIR3DL1",  # NK cells
  "MKI67",                    # Proliferating NK cells
  "CD34", "KIT",              # HSPCs
  "CD3E", "CCR7",             # T cells
  "SELL", "CD45RO",           # Tnaive, Tcm
  "CD44", "CD45RA"            # Tem, Temra
)

# FeaturePlot() with several features returns a patchwork of panels. Adding
# ggtitle()/NoLegend() with `+` only modifies the LAST panel (and replaces
# that panel's feature name with the title). `&` applies NoLegend() to every
# panel; plot_annotation() adds one title for the whole figure.
cd4_feature_plot1 <- (
  FeaturePlot(
    All_samples_Merged,
    features = myfeatures1,
    reduction = "umap.harmony",
    ncol = 4
  ) &
    NoLegend()
) +
  patchwork::plot_annotation(
    title = "CD4 T Cell Marker Expression - Harmony Integration"
  )
Avis : Could not find CD45RO in the default search locations, found in 'ADT' assay insteadAvis : Could not find CD45RA in the default search locations, found in 'ADT' assay instead
# Render the marker panel built above
plot(cd4_feature_plot1)

# Markers for CD4 T cells and their subsets
cd4_markers <- c(
  "CD4",    # General CD4 T cells
  "IL7R",   # Naive T cells
  "CCR7",   # T central memory (Tcm) cells
  "SELL",   # T naive cells
  "FOXP3",  # Regulatory T cells (Tregs)
  "IL2RA",  # Activated T cells
  "PDCD1",  # Exhausted T cells
  "LAG3",   # Exhausted T cells
  "TIGIT",  # Exhausted T cells
  "GATA3",  # Th2 cells
  "TBX21",  # Th1 cells
  "RORC",   # Th17 cells
  "BCL6"    # T follicular helper (Tfh) cells
)

# One UMAP panel per marker on the Harmony embedding.
# FeaturePlot() with several features returns a patchwork of panels. Adding
# ggtitle()/NoLegend() with `+` only modifies the LAST panel (and replaces
# that panel's feature name with the title). `&` applies NoLegend() to every
# panel; plot_annotation() adds one title for the whole figure.
cd4_feature_plot2 <- (
  FeaturePlot(
    All_samples_Merged,
    features = cd4_markers,
    reduction = "umap.harmony",
    ncol = 4
  ) &
    NoLegend()
) +
  patchwork::plot_annotation(
    title = "CD4 T Cell Marker Expression - Harmony Integration"
  )

# Display the plot
print(cd4_feature_plot2)

CD4 T Cell Marker Visualization
# Marker genes for CD4 T cell biology and states.
# Each gene is listed once: IL2RA (CD25) is both an activation and a Treg
# marker and previously appeared twice, which requested the same feature
# twice from the plotting functions.
cd4_markers <- c(
  # Core T cell markers
  "CD3E",     # T cell marker
  "CD4",      # CD4 T cell marker
  # Naive/Memory markers
  "CCR7",     # Naive/Central memory
  "SELL",     # L-selectin, naive marker
  "CD27",     # Memory marker
  "IL7R",     # Naive/Memory marker
  # Activation/State markers
  "IL2RA",    # CD25, activation marker (also a Treg marker)
  "CD69",     # Early activation
  "HLA-DRA",  # Activation marker
  # Exhaustion markers
  "PDCD1",    # PD-1
  "LAG3",     # Exhaustion marker
  "TIGIT",    # Exhaustion marker
  # Regulatory T cell markers
  "FOXP3",    # Treg marker
  "CTLA4",    # Treg/exhaustion marker
  # Effector/Function markers
  "IL2",      # T cell function
  "IFNG",     # Th1
  "IL4",      # Th2
  "IL13",     # Th2
  "IL17A"     # Th17
)

# Feature plots on the Harmony UMAP.
# FeaturePlot() with several features returns a patchwork of panels, so the
# per-panel tweak (NoLegend) is applied with `&`, and the figure-level title
# and its styling go through plot_annotation(). Adding them with `+` would
# only alter the last panel and overwrite that panel's gene name.
(
  FeaturePlot(
    All_samples_Merged,
    features = cd4_markers,
    reduction = "umap.harmony",
    ncol = 4,
    pt.size = 0.1,        # Smaller point size for better resolution
    min.cutoff = "q1",    # Clip values below each feature's 1st percentile
    max.cutoff = "q99",   # Clip values above each feature's 99th percentile
    order = TRUE          # Plot highest expressing cells on top
  ) &
    NoLegend()
) +
  patchwork::plot_annotation(
    title = "CD4 T Cell Marker Expression - Harmony Integration",
    theme = theme(plot.title = element_text(size = 16, face = "bold"))
  )

# Optional: stacked violin plots of expression across clusters.
# The whole marker vector is used; a fixed 1:20 index would run past the end
# of the de-duplicated vector.
VlnPlot(
  All_samples_Merged,
  features = cd4_markers,
  stack = TRUE,
  flip = TRUE
) +
  ggtitle("CD4 T Cell Marker Distribution Across Clusters")

NA
NA
FindMarkers
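# NOTE: the cluster IDs hard-coded in this disabled section look stale. The
# recorded cell_line x SCT_snn_res.0.2 table in this notebook places the PBMC
# controls in clusters 6, 8, 10 and 11, whereas clusters 4 and 7 (labelled
# "CD4 T cells" below) are dominated by L7 and L5. Re-derive ident.1/ident.2
# from the current clustering before re-enabling any of this code.
# Idents(All_samples_Merged) <- "seurat_clusters"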
#
# markers1 <- FindMarkers (All_samples_Merged,
# ident.1 = c("0","2", "3", "8","1", "5", "6", "9", "10"), #cell lines
# ident.2 = c("4","7"), #CD4 T cells
# assay = "SCT",
# test.use = "MAST",
# latent.vars = c("cell_line")) #batch
#
# markers2 <- FindMarkers(All_samples_Merged,
# ident.1 = c("0", "2", "3", "8", "1", "5", "6", "9", "10"), #cell lines
# ident.2 = c("4", "7"), # CD4 T cells
# assay = "SCT",
# test.use = "wilcox") # Change to Wilcoxon test
#
#
#
# markers3 <- FindMarkers(All_samples_Merged,
# ident.1 = c("0", "2", "3", "8", "1", "5", "6", "9", "10"), # cell lines
# ident.2 = c("4", "7"), # CD4 T cells
# assay = "SCT",
# test.use = "MAST") # Use MAST
#
# write.csv(markers1, file = "0-imp_Robj/TEST_MAST_with_batch_results.csv", row.names = TRUE)
#
# write.csv(markers2, file = "0-imp_Robj/TEST_Wilcox_without_batch_results2.csv", row.names = TRUE)
#
# write.csv(markers3, file = "0-imp_Robj/TEST_MAST_without_batch_results3.csv", row.names = TRUE)
#
# #Marker1
#
# # Subset cells in each group
# group1_cells <- WhichCells(All_samples_Merged, idents = c("0", "2", "3", "8", "1", "5", "6", "9", "10"))
# group2_cells <- WhichCells(All_samples_Merged, idents = c("4", "7"))
#
# # Extract normalized expression values
# expression_data <- GetAssayData(All_samples_Merged, layer = "data") # log-normalized values (the object stores layers; `slot` is deprecated in favour of `layer`)
#
# # Calculate mean expression for each group
# group1_mean <- rowMeans(expression_data[, group1_cells])
# group2_mean <- rowMeans(expression_data[, group2_cells])
#
#
# # Add mean expression to markers result
# markers1$mean_expr_group1 <- group1_mean[rownames(markers1)]
# markers1$mean_expr_group2 <- group2_mean[rownames(markers1)]
#
#
# #Marker2
#
# # Add mean expression to markers result
# markers2$mean_expr_group1 <- group1_mean[rownames(markers2)]
# markers2$mean_expr_group2 <- group2_mean[rownames(markers2)]
#
# #Marker3
#
# # Add mean expression to markers result
# markers3$mean_expr_group1 <- group1_mean[rownames(markers3)]
# markers3$mean_expr_group2 <- group2_mean[rownames(markers3)]
#
# write.csv(markers1, file = "0-imp_Robj/TEST_MAST_with_batch_results1_with_meanExpression.csv", row.names = TRUE)
#
# write.csv(markers2, file = "0-imp_Robj/TEST_Wilcox_without_batch_results2_with_meanExpression.csv", row.names = TRUE)
#
# write.csv(markers3, file = "0-imp_Robj/TEST_MAST_without_batch_results3_with_meanExpression.csv", row.names = TRUE)
#
#
# #Marker1
# # Count genes with p_val_adj = 0
# num_pval0 <- sum(markers1$p_val_adj == 0)
# cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
#
# # Count genes with p_val_adj = 1
# num_pval1 <- sum(markers1$p_val_adj == 1)
# cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
#
#
# num_significant <- sum(markers1$p_val_adj < 0.05)
# cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
#
#
# # Genes with avg_log2FC > 1 (upregulated)
# num_upregulated <- sum(markers1$avg_log2FC > 1)
# cat("Number of upregulated genes (avg_log2FC > 1):", num_upregulated, "\n")
#
# # Genes with avg_log2FC < -1 (downregulated)
# num_downregulated <- sum(markers1$avg_log2FC < -1)
# cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
#
# #Marker2
#
# # Count genes with p_val_adj = 0
# num_pval0 <- sum(markers2$p_val_adj == 0)
# cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
#
# # Count genes with p_val_adj = 1
# num_pval1 <- sum(markers2$p_val_adj == 1)
# cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
#
#
# num_significant <- sum(markers2$p_val_adj < 0.05)
# cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
#
#
# # Genes with avg_log2FC > 1 (upregulated)
# num_upregulated <- sum(markers2$avg_log2FC > 1)
# cat("Number of upregulated genes (avg_log2FC > 1):", num_upregulated, "\n")
#
# # Genes with avg_log2FC < -1 (downregulated)
# num_downregulated <- sum(markers2$avg_log2FC < -1)
# cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
#
#
# #Marker3
#
# # Count genes with p_val_adj = 0
# num_pval0 <- sum(markers3$p_val_adj == 0)
# cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
#
# # Count genes with p_val_adj = 1
# num_pval1 <- sum(markers3$p_val_adj == 1)
# cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
#
#
# num_significant <- sum(markers3$p_val_adj < 0.05)
# cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
#
#
# # Genes with avg_log2FC > 1 (upregulated)
# num_upregulated <- sum(markers3$avg_log2FC > 1)
# cat("Number of upregulated genes (avg_log2FC > 1):", num_upregulated, "\n")
#
# # Genes with avg_log2FC < -1 (downregulated)
# num_downregulated <- sum(markers3$avg_log2FC < -1)
# cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
#
6. Save the Seurat object as an Robj file
# Write the integrated object to disk under its own name, so a later load()
# restores it as All_samples_Merged.
save(
  list = "All_samples_Merged",
  file = "0-imp_Robj/Harmony_integrated_CD4Tcells_harmony_integrated.Robj"
)
