1. load libraries
2. Load Seurat Object
# Restore the merged Seurat object (cell lines plus a control, already
# filtered). The rest of this script expects this file to provide
# `All_samples_Merged`.
load(file = "../0-R_Objects/CD4Tcells_SCTnormalized_done_on_HPC_inluding_Patient_origin.robj")
# Visualize before Harmony integration
# Pre-integration UMAP coloured by patient of origin. The title now names the
# grouping variable actually plotted (it previously read "By Cell Line").
DimPlot(
  All_samples_Merged,
  reduction = "umap",
  group.by = "Patient_origin",
  label = TRUE,
  label.box = TRUE
) +
  ggtitle("Before Harmony - By Patient Origin")

# Pre-integration UMAP, one panel per metadata column of interest.

# Coloured by cell line
DimPlot(
  object = All_samples_Merged,
  group.by = "cell_line",
  reduction = "umap",
  label.box = TRUE,
  label = TRUE
) +
  ggtitle("Before Harmony - By Cell Line")

# Coloured by the clustering stored in SCT_snn_res.0.5
DimPlot(
  object = All_samples_Merged,
  group.by = "SCT_snn_res.0.5",
  reduction = "umap",
  label.box = TRUE,
  label = TRUE
) +
  ggtitle("Before Harmony - By Clusters")

# Coloured by predicted cell type, level 1
DimPlot(
  object = All_samples_Merged,
  group.by = "predicted.celltype.l1",
  reduction = "umap",
  label.box = TRUE,
  label = TRUE
) +
  ggtitle("Before Harmony - By Annotation.l1")

# Coloured by predicted cell type, level 2
DimPlot(
  object = All_samples_Merged,
  group.by = "predicted.celltype.l2",
  reduction = "umap",
  label.box = TRUE,
  label = TRUE
) +
  ggtitle("Before Harmony - By Annotation.l2")

# Coloured by predicted cell type, level 3
DimPlot(
  object = All_samples_Merged,
  group.by = "predicted.celltype.l3",
  reduction = "umap",
  label.box = TRUE,
  label = TRUE
) +
  ggtitle("Before Harmony - By Annotation.l3")

# Cross-tabulate the level-2 predicted cell types (rows) against the
# clustering stored in SCT_snn_res.0.5 (columns), i.e. the clusters shown in
# the "Before Harmony - By Clusters" plot.
table(All_samples_Merged$predicted.celltype.l2, All_samples_Merged$SCT_snn_res.0.5)
0 1 2 3 4 5 6 7 8 9 10 11 12 13
B intermediate 0 3 0 0 0 0 2 0 0 0 0 0 2 0
B memory 8 6 1 0 85 0 30 2 0 115 4 0 1 0
CD14 Mono 0 1 0 0 0 0 4 0 0 7 0 0 0 0
CD4 CTL 0 0 0 0 0 12 0 0 0 0 0 0 0 1
CD4 Naive 0 8 0 0 0 517 0 0 1479 0 0 37 0 1
CD4 Proliferating 5448 2474 5388 2852 3954 0 3256 2863 6 1270 1407 0 93 0
CD4 TCM 871 3414 522 269 536 4214 106 29 1838 457 46 425 49 54
CD4 TEM 0 1 0 0 0 61 0 0 21 0 0 1 0 0
CD8 Proliferating 0 0 0 0 1 0 0 0 0 1 0 0 0 0
CD8 TCM 0 1 0 16 0 0 0 0 0 0 0 0 0 0
CD8 TEM 0 1 0 8 3 0 2 0 0 1 0 0 0 0
cDC1 0 0 0 0 5 0 2 0 0 0 0 0 1 0
cDC2 0 1 2 0 3 0 10 0 0 36 0 0 0 1
dnT 0 3 1 1 1 0 2 0 0 3 0 1 3 0
HSPC 57 10 1 0 211 0 678 483 0 5 358 0 2 0
NK Proliferating 4 40 23 2785 237 0 10 12 0 22 1 0 27 0
Treg 15 14 1 0 1 0 0 0 0 0 0 1 13 0
3. Perform Harmony Integration
# Perform Harmony integration on the PCA embedding, correcting for both
# patient of origin and cell line.
# NOTE: the parameter is `dims.use` (plural). The previous spelling `dim.use`
# does not match any RunHarmony() argument, so the 1:15 restriction was not
# being applied to the integration.
All_samples_Merged <- RunHarmony(
  All_samples_Merged,
  group.by.vars = c("Patient_origin", "cell_line"),
  reduction.use = "pca",
  dims.use = 1:15,      # same 15 dimensions that are used downstream
  theta = c(0.5, 0.5),  # one value per entry of group.by.vars
  assay.use = "SCT"
)
Transposing data matrix
Initializing state using k-means centroids initialization
Harmony 1/10
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Harmony 2/10
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Harmony converged after 2 iterations
# Build the nearest-neighbour / SNN graph from the Harmony embedding.
# `reduction = "harmony"` must be given explicitly: FindNeighbors() defaults to
# the "pca" reduction, which would bypass the batch correction.
All_samples_Merged <- FindNeighbors(
  All_samples_Merged,
  reduction = "harmony",
  dims = 1:15  # first 15 Harmony dimensions
)
Computing nearest neighbor graph
Computing SNN
# Cluster the cells on the graph computed by FindNeighbors() above (which was
# built from the Harmony embedding). FindClusters() takes no `reduction`
# argument; passing one only triggered the warning
# "The following arguments are not used: reduction".
All_samples_Merged <- FindClusters(All_samples_Merged, resolution = 0.5)
Avis : The following arguments are not used: reductionAvis : The following arguments are not used: reduction
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 49372
Number of edges: 1505757
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8883
Number of communities: 13
Elapsed time: 16 seconds
# Compute a UMAP from the first 15 Harmony dimensions and store it under its
# own name ("umap.harmony") so it does not replace the existing "umap".
All_samples_Merged <- RunUMAP(
  object = All_samples_Merged,
  dims = 1:15,
  reduction = "harmony",
  reduction.name = "umap.harmony"
)
20:49:13 UMAP embedding parameters a = 0.9922 b = 1.112
20:49:13 Read 49372 rows and found 15 numeric columns
20:49:13 Using Annoy for neighbor search, n_neighbors = 30
20:49:13 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
20:49:19 Writing NN index file to temp file /tmp/RtmpoINXWz/file176a6fac25cbe
20:49:19 Searching Annoy index using 1 thread, search_k = 3000
20:49:40 Annoy recall = 100%
20:49:42 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
20:49:45 Initializing from normalized Laplacian + noise (using RSpectra)
20:49:47 Commencing optimization for 200 epochs, with 2030150 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
20:50:17 Optimization finished
4. Visualize Harmony Integrated Data
# Post-integration UMAPs, all drawn on the "umap.harmony" reduction.

# p3: coloured by cell line
p3 <- DimPlot(
  object = All_samples_Merged,
  group.by = "cell_line",
  reduction = "umap.harmony",
  label.box = TRUE,
  label = TRUE
) +
  ggtitle("After Harmony - By Cell Line")

# p4: coloured by cluster
p4 <- DimPlot(
  object = All_samples_Merged,
  group.by = "seurat_clusters",
  reduction = "umap.harmony",
  label.box = TRUE,
  label = TRUE
) +
  ggtitle("After Harmony - By Clusters")

# p5: coloured by level-2 predicted cell type
p5 <- DimPlot(
  object = All_samples_Merged,
  group.by = "predicted.celltype.l2",
  reduction = "umap.harmony",
  label.box = TRUE,
  label = TRUE
) +
  ggtitle("After Harmony - Cell Type Annotations")

# Cell-line and cluster views combined into one figure
p3 + p4

# Annotation view on its own
print(p5)

# Same three post-integration views with repelled labels so that overlapping
# group labels stay readable. `TRUE` is spelled out instead of `T`, which is an
# ordinary variable that can be reassigned.
DimPlot(
  All_samples_Merged,
  reduction = "umap.harmony",
  group.by = "cell_line",
  label = TRUE,
  label.box = TRUE,
  repel = TRUE
) +
  ggtitle("Harmony Integration - By Cell Line")

DimPlot(
  All_samples_Merged,
  reduction = "umap.harmony",
  group.by = "seurat_clusters",
  label = TRUE,
  label.box = TRUE,
  repel = TRUE
) +
  ggtitle("Harmony Integration - By Clusters")

DimPlot(
  All_samples_Merged,
  reduction = "umap.harmony",
  group.by = "predicted.celltype.l2",
  label = TRUE,
  label.box = TRUE,
  repel = TRUE
) +
  ggtitle("Harmony Integration - Annotations")

# Count cells per level-2 predicted cell type (rows) and cell line (columns).
table(All_samples_Merged$predicted.celltype.l2, All_samples_Merged$cell_line)
L1 L2 L3 L4 L5 L6 L7 PBMC PBMC_10x
B intermediate 0 0 2 1 2 2 0 0 0
B memory 0 0 11 1 38 82 120 0 0
CD14 Mono 0 0 1 0 5 0 6 0 0
CD4 CTL 0 0 0 0 0 0 0 12 1
CD4 Naive 0 0 0 7 0 0 0 523 1512
CD4 Proliferating 2461 2852 5452 5391 4732 4002 4115 0 6
CD4 TCM 3320 270 887 562 178 557 517 4576 1963
CD4 TEM 1 0 0 0 0 0 0 60 23
CD8 Proliferating 0 0 0 0 0 1 1 0 0
CD8 TCM 1 16 0 0 0 0 0 0 0
CD8 TEM 1 8 0 0 2 3 1 0 0
cDC1 0 0 0 0 2 6 0 0 0
cDC2 0 0 0 4 11 3 35 0 0
dnT 2 3 0 1 2 5 2 0 0
HSPC 0 0 60 7 1035 213 490 0 0
NK Proliferating 38 2785 6 24 11 259 38 0 0
Treg 1 1 9 9 4 15 6 0 0
# Count cells per level-2 predicted cell type (rows) and current
# seurat_clusters assignment (columns).
table(All_samples_Merged$predicted.celltype.l2, All_samples_Merged$seurat_clusters)
0 1 2 3 4 5 6 7 8 9 10 11 12
B intermediate 0 0 0 0 2 2 0 0 0 0 0 3 0
B memory 5 1 0 0 18 14 197 4 0 0 0 13 0
CD14 Mono 0 0 1 0 0 0 10 0 0 0 0 0 1
CD4 CTL 0 0 7 2 1 0 2 0 0 0 0 0 1
CD4 Naive 1 0 1478 2 8 49 0 0 2 0 501 0 1
CD4 Proliferating 7905 6741 89 2901 1378 1467 1388 2418 2347 1694 5 678 0
CD4 TCM 930 133 5115 62 2315 1879 1503 33 9 7 725 68 51
CD4 TEM 0 0 78 1 0 4 1 0 0 0 0 0 0
CD8 Proliferating 0 0 0 0 0 0 2 0 0 0 0 0 0
CD8 TCM 1 0 0 1 14 0 0 0 1 0 0 0 0
CD8 TEM 0 0 0 0 5 4 6 0 0 0 0 0 0
cDC1 0 0 0 0 0 1 4 0 0 0 0 3 0
cDC2 0 0 0 0 0 0 47 1 0 0 0 3 2
dnT 0 0 0 0 0 8 7 0 0 0 0 0 0
HSPC 81 996 0 5 43 7 18 621 1 0 0 33 0
NK Proliferating 3 232 26 2709 47 89 15 8 22 2 0 8 0
Treg 0 0 0 0 2 41 2 0 0 0 0 0 0
Visualize Harmony Integrated Data distribution
# Cell type (rows) by cell line (columns); this repeats the cross-tabulation
# already printed at the end of section 4.
table(All_samples_Merged$predicted.celltype.l2, All_samples_Merged$cell_line)
L1 L2 L3 L4 L5 L6 L7 PBMC PBMC_10x
B intermediate 0 0 2 1 2 2 0 0 0
B memory 0 0 11 1 38 82 120 0 0
CD14 Mono 0 0 1 0 5 0 6 0 0
CD4 CTL 0 0 0 0 0 0 0 12 1
CD4 Naive 0 0 0 7 0 0 0 523 1512
CD4 Proliferating 2461 2852 5452 5391 4732 4002 4115 0 6
CD4 TCM 3320 270 887 562 178 557 517 4576 1963
CD4 TEM 1 0 0 0 0 0 0 60 23
CD8 Proliferating 0 0 0 0 0 1 1 0 0
CD8 TCM 1 16 0 0 0 0 0 0 0
CD8 TEM 1 8 0 0 2 3 1 0 0
cDC1 0 0 0 0 2 6 0 0 0
cDC2 0 0 0 4 11 3 35 0 0
dnT 2 3 0 1 2 5 2 0 0
HSPC 0 0 60 7 1035 213 490 0 0
NK Proliferating 38 2785 6 24 11 259 38 0 0
Treg 1 1 9 9 4 15 6 0 0
# Cell type (rows) by cluster (columns); this repeats the cross-tabulation
# already printed at the end of section 4.
table(All_samples_Merged$predicted.celltype.l2, All_samples_Merged$seurat_clusters)
0 1 2 3 4 5 6 7 8 9 10 11 12
B intermediate 0 0 0 0 2 2 0 0 0 0 0 3 0
B memory 5 1 0 0 18 14 197 4 0 0 0 13 0
CD14 Mono 0 0 1 0 0 0 10 0 0 0 0 0 1
CD4 CTL 0 0 7 2 1 0 2 0 0 0 0 0 1
CD4 Naive 1 0 1478 2 8 49 0 0 2 0 501 0 1
CD4 Proliferating 7905 6741 89 2901 1378 1467 1388 2418 2347 1694 5 678 0
CD4 TCM 930 133 5115 62 2315 1879 1503 33 9 7 725 68 51
CD4 TEM 0 0 78 1 0 4 1 0 0 0 0 0 0
CD8 Proliferating 0 0 0 0 0 0 2 0 0 0 0 0 0
CD8 TCM 1 0 0 1 14 0 0 0 1 0 0 0 0
CD8 TEM 0 0 0 0 5 4 6 0 0 0 0 0 0
cDC1 0 0 0 0 0 1 4 0 0 0 0 3 0
cDC2 0 0 0 0 0 0 47 1 0 0 0 3 2
dnT 0 0 0 0 0 8 7 0 0 0 0 0 0
HSPC 81 996 0 5 43 7 18 621 1 0 0 33 0
NK Proliferating 3 232 26 2709 47 89 15 8 22 2 0 8 0
Treg 0 0 0 0 2 41 2 0 0 0 0 0 0
# Count cells per cell line (rows) and cluster (columns) to see how each
# sample is distributed across the clusters.
table(All_samples_Merged$cell_line, All_samples_Merged$seurat_clusters)
0 1 2 3 4 5 6 7 8 9 10 11 12
L1 228 12 45 295 2186 2499 114 1 0 438 2 5 0
L2 59 14 29 4771 913 71 14 5 1 57 0 1 0
L3 5891 20 0 14 187 49 101 11 54 13 0 87 1
L4 2468 3 20 34 115 50 703 3 2289 45 3 272 2
L5 100 1966 6 112 172 36 259 2891 17 173 0 290 0
L6 71 2967 15 158 78 156 1049 17 2 543 2 90 0
L7 107 3120 9 282 141 70 942 157 16 434 1 52 0
PBMC 1 0 4106 17 37 476 14 0 1 0 476 12 31
PBMC_10x 1 1 2564 0 4 158 6 0 2 0 747 0 22
5. Marker Gene Visualization
# Marker panel covering the main immune lineages.
# NOTE(review): "CD45RO" and "CD45RA" are not RNA feature names; in the recorded
# run Seurat reported that it found them in the 'ADT' assay instead.
myfeatures1 <- c(
  "CD19", "CD79A", "MS4A1",     # B cells
  "CD14", "LYZ", "FCGR3A",      # Monocytes
  "CSF1R", "CD68",              # Macrophages
  "NKG7", "GNLY", "KIR3DL1",    # NK cells
  "MKI67",                      # Proliferation (any cycling cell, not NK-specific)
  "CD34", "KIT",                # HSPCs
  "CD3E", "CCR7",               # T cells
  "SELL", "CD45RO",             # Tnaive, Tcm
  "CD44", "CD45RA"              # Tem, Temra
)

# FeaturePlot() with several features returns a patchwork of panels. On such an
# object `+` only modifies the last panel, so `+ ggtitle() + NoLegend()` would
# overwrite the last feature's title and leave every other legend in place.
# `&` applies NoLegend() to every panel; plot_annotation() adds one overall
# title without touching the per-feature titles.
cd4_feature_plot1 <- (
  FeaturePlot(
    All_samples_Merged,
    features = myfeatures1,
    reduction = "umap.harmony",
    ncol = 4
  ) &
    NoLegend()
) +
  patchwork::plot_annotation(
    title = "CD4 T Cell Marker Expression - Harmony Integration"
  )
Avis : Could not find CD45RO in the default search locations, found in 'ADT' assay insteadAvis : Could not find CD45RA in the default search locations, found in 'ADT' assay instead
# Render the feature-plot grid stored in cd4_feature_plot1.
print(cd4_feature_plot1)

# Define markers specific to CD4 T cells and their subsets
cd4_markers <- c(
  "CD4",    # General CD4 T cells
  "IL7R",   # Naive T cells
  "CCR7",   # T central memory (Tcm) cells
  "SELL",   # T naive cells
  "FOXP3",  # Regulatory T cells (Tregs)
  "IL2RA",  # Activated T cells
  "PDCD1",  # Exhausted T cells
  "LAG3",   # Exhausted T cells
  "TIGIT",  # Exhausted T cells
  "GATA3",  # Th2 cells
  "TBX21",  # Th1 cells
  "RORC",   # Th17 cells
  "BCL6"    # T follicular helper (Tfh) cells
)

# Visualize marker genes for CD4 T cells.
# The multi-feature FeaturePlot() result is a patchwork: `&` is needed to apply
# NoLegend() to every panel (`+` would only affect the last one), and
# plot_annotation() sets a single overall title instead of replacing the last
# panel's feature name.
cd4_feature_plot2 <- (
  FeaturePlot(
    All_samples_Merged,
    features = cd4_markers,
    reduction = "umap.harmony",
    ncol = 4
  ) &
    NoLegend()
) +
  patchwork::plot_annotation(
    title = "CD4 T Cell Marker Expression - Harmony Integration"
  )

# Display the plot
print(cd4_feature_plot2)

CD4 T Cell Marker Visualization
# Set marker genes specific to CD4 T cell biology and states.
# Each gene is listed exactly once ("IL2RA" used to appear under both the
# activation and the Treg headings), so the vector can be passed to the
# plotting functions as-is.
cd4_markers <- c(
  # Core T cell markers
  "CD3E",     # T cell marker
  "CD4",      # CD4 T cell marker
  # Naive/Memory markers
  "CCR7",     # Naive/Central memory
  "SELL",     # L-selectin, naive marker
  "CD27",     # Memory marker
  "IL7R",     # Naive/Memory marker
  # Activation/State markers
  "IL2RA",    # CD25, activation marker (also a Treg marker)
  "CD69",     # Early activation
  "HLA-DRA",  # Activation marker
  # Exhaustion markers
  "PDCD1",    # PD-1
  "LAG3",     # Exhaustion marker
  "TIGIT",    # Exhaustion marker
  # Regulatory T cell markers
  "FOXP3",    # Treg marker
  "CTLA4",    # Treg/exhaustion marker
  # Effector/Function markers
  "IL2",      # T cell function
  "IFNG",     # Th1
  "IL4",      # Th2
  "IL13",     # Th2
  "IL17A"     # Th17
)

# Create feature plots with better visualization.
# The result is a patchwork of one panel per gene, so NoLegend() is applied
# with `&` (all panels) and the overall title is attached with
# plot_annotation(); using `+` would only restyle the last panel.
(
  FeaturePlot(
    All_samples_Merged,
    features = cd4_markers,
    reduction = "umap.harmony",
    ncol = 4,
    pt.size = 0.1,        # Smaller point size for better resolution
    min.cutoff = "q1",    # Clip the colour scale at the 1st percentile
    max.cutoff = "q99",   # Clip the colour scale at the 99th percentile
    order = TRUE          # Plot highest expressing cells on top
  ) &
    NoLegend()
) +
  patchwork::plot_annotation(
    title = "CD4 T Cell Marker Expression - Harmony Integration",
    theme = theme(plot.title = element_text(size = 16, face = "bold"))
  )

# Optional: stacked violin plots to see the expression distribution of every
# marker across clusters. The whole vector is used rather than a hard-coded
# index range, so the call stays valid if markers are added or removed.
VlnPlot(
  All_samples_Merged,
  features = cd4_markers,
  stack = TRUE,
  flip = TRUE
) +
  ggtitle("CD4 T Cell Marker Distribution Across Clusters")

NA
NA
FindMarkers
# Idents(All_samples_Merged) <- "seurat_clusters"
#
# markers1 <- FindMarkers (All_samples_Merged,
# ident.1 = c("0","2", "3", "8","1", "5", "6", "9", "10"), #cell lines
# ident.2 = c("4","7"), #CD4 T cells
# assay = "SCT",
# test.use = "MAST",
# latent.vars = c("cell_line")) #batch
#
# markers2 <- FindMarkers(All_samples_Merged,
# ident.1 = c("0", "2", "3", "8", "1", "5", "6", "9", "10"), #cell lines
# ident.2 = c("4", "7"), # CD4 T cells
# assay = "SCT",
# test.use = "wilcox") # Change to Wilcoxon test
#
#
#
# markers3 <- FindMarkers(All_samples_Merged,
# ident.1 = c("0", "2", "3", "8", "1", "5", "6", "9", "10"), # cell lines
# ident.2 = c("4", "7"), # CD4 T cells
# assay = "SCT",
# test.use = "MAST") # Use MAST
#
# write.csv(markers1, file = "0-imp_Robj/TEST_MAST_with_batch_results.csv", row.names = TRUE)
#
# write.csv(markers2, file = "0-imp_Robj/TEST_Wilcox_without_batch_results2.csv", row.names = TRUE)
#
# write.csv(markers3, file = "0-imp_Robj/TEST_MAST_without_batch_results3.csv", row.names = TRUE)
#
# #Marker1
#
# # Subset cells in each group
# group1_cells <- WhichCells(All_samples_Merged, idents = c("0", "2", "3", "8", "1", "5", "6", "9", "10"))
# group2_cells <- WhichCells(All_samples_Merged, idents = c("4", "7"))
#
# # Extract normalized expression values
# expression_data <- GetAssayData(All_samples_Merged, slot = "data") # log-normalized values
#
# # Calculate mean expression for each group
# group1_mean <- rowMeans(expression_data[, group1_cells])
# group2_mean <- rowMeans(expression_data[, group2_cells])
#
#
# # Add mean expression to markers result
# markers1$mean_expr_group1 <- group1_mean[rownames(markers1)]
# markers1$mean_expr_group2 <- group2_mean[rownames(markers1)]
#
#
# #Marker2
#
# # Add mean expression to markers result
# markers2$mean_expr_group1 <- group1_mean[rownames(markers2)]
# markers2$mean_expr_group2 <- group2_mean[rownames(markers2)]
#
# #Marker3
#
# # Add mean expression to markers result
# markers3$mean_expr_group1 <- group1_mean[rownames(markers3)]
# markers3$mean_expr_group2 <- group2_mean[rownames(markers3)]
#
# write.csv(markers1, file = "0-imp_Robj/TEST_MAST_with_batch_results1_with_meanExpression.csv", row.names = TRUE)
#
# write.csv(markers2, file = "0-imp_Robj/TEST_Wilcox_without_batch_results2_with_meanExpression.csv", row.names = TRUE)
#
# write.csv(markers3, file = "0-imp_Robj/TEST_MAST_without_batch_results3_with_meanExpression.csv", row.names = TRUE)
#
#
# #Marker1
# # Count genes with p_val_adj = 0
# num_pval0 <- sum(markers1$p_val_adj == 0)
# cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
#
# # Count genes with p_val_adj = 1
# num_pval1 <- sum(markers1$p_val_adj == 1)
# cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
#
#
# num_significant <- sum(markers1$p_val_adj < 0.05)
# cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
#
#
# # Genes with avg_log2FC > 1 (upregulated)
# num_upregulated <- sum(markers1$avg_log2FC > 1)
# cat("Number of upregulated genes (avg_log2FC > 1):", num_upregulated, "\n")
#
# # Genes with avg_log2FC < -1 (downregulated)
# num_downregulated <- sum(markers1$avg_log2FC < -1)
# cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
#
# #Marker2
#
# # Count genes with p_val_adj = 0
# num_pval0 <- sum(markers2$p_val_adj == 0)
# cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
#
# # Count genes with p_val_adj = 1
# num_pval1 <- sum(markers2$p_val_adj == 1)
# cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
#
#
# num_significant <- sum(markers2$p_val_adj < 0.05)
# cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
#
#
# # Genes with avg_log2FC > 1 (upregulated)
# num_upregulated <- sum(markers2$avg_log2FC > 1)
# cat("Number of upregulated genes (avg_log2FC > 1):", num_upregulated, "\n")
#
# # Genes with avg_log2FC < -1 (downregulated)
# num_downregulated <- sum(markers2$avg_log2FC < -1)
# cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
#
#
# #Marker3
#
# # Count genes with p_val_adj = 0
# num_pval0 <- sum(markers3$p_val_adj == 0)
# cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
#
# # Count genes with p_val_adj = 1
# num_pval1 <- sum(markers3$p_val_adj == 1)
# cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
#
#
# num_significant <- sum(markers3$p_val_adj < 0.05)
# cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
#
#
# # Genes with avg_log2FC > 1 (upregulated)
# num_upregulated <- sum(markers3$avg_log2FC > 1)
# cat("Number of upregulated genes (avg_log2FC > 1):", num_upregulated, "\n")
#
# # Genes with avg_log2FC < -1 (downregulated)
# num_downregulated <- sum(markers3$avg_log2FC < -1)
# cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
#
6. Save the Seurat object as an Robj file
#save(All_samples_Merged, file = "0-imp_Robj/Harmony_integrated_CD4Tcells_harmony_integrated.Robj")
