1. load libraries

2. Load Seurat Object


#Load Seurat Object merged from cell lines and a control after filtration
load("../0-R_Objects/CD4Tcells_SCTnormalized_done_on_HPC_inluding_Patient_origin.robj")



# Visualize before Harmony integration
DimPlot(All_samples_Merged, 
              reduction = "umap", 
              group.by = "Patient_origin",
              label = TRUE, 
              label.box = TRUE) + 
      ggtitle("Before Harmony - By Cell Line")



DimPlot(All_samples_Merged, 
              reduction = "umap", 
              group.by = "cell_line",
              label = TRUE, 
              label.box = TRUE) + 
      ggtitle("Before Harmony - By Cell Line")


DimPlot(All_samples_Merged, 
              reduction = "umap", 
              group.by = "SCT_snn_res.0.5",
              label = TRUE, 
              label.box = TRUE) + 
      ggtitle("Before Harmony - By Clusters")


DimPlot(All_samples_Merged, 
              reduction = "umap", 
              group.by = "predicted.celltype.l1",
              label = TRUE, 
              label.box = TRUE) + 
      ggtitle("Before Harmony - By Annotation.l1")



DimPlot(All_samples_Merged, 
              reduction = "umap", 
              group.by = "predicted.celltype.l2",
              label = TRUE, 
              label.box = TRUE) + 
      ggtitle("Before Harmony - By Annotation.l2")


DimPlot(All_samples_Merged, 
              reduction = "umap", 
              group.by = "predicted.celltype.l3",
              label = TRUE, 
              label.box = TRUE) + 
      ggtitle("Before Harmony - By Annotation.l3")



table(All_samples_Merged$predicted.celltype.l2, All_samples_Merged$SCT_snn_res.0.5)
                   
                       0    1    2    3    4    5    6    7    8    9   10   11   12   13
  B intermediate       0    3    0    0    0    0    2    0    0    0    0    0    2    0
  B memory             8    6    1    0   85    0   30    2    0  115    4    0    1    0
  CD14 Mono            0    1    0    0    0    0    4    0    0    7    0    0    0    0
  CD4 CTL              0    0    0    0    0   12    0    0    0    0    0    0    0    1
  CD4 Naive            0    8    0    0    0  517    0    0 1479    0    0   37    0    1
  CD4 Proliferating 5448 2474 5388 2852 3954    0 3256 2863    6 1270 1407    0   93    0
  CD4 TCM            871 3414  522  269  536 4214  106   29 1838  457   46  425   49   54
  CD4 TEM              0    1    0    0    0   61    0    0   21    0    0    1    0    0
  CD8 Proliferating    0    0    0    0    1    0    0    0    0    1    0    0    0    0
  CD8 TCM              0    1    0   16    0    0    0    0    0    0    0    0    0    0
  CD8 TEM              0    1    0    8    3    0    2    0    0    1    0    0    0    0
  cDC1                 0    0    0    0    5    0    2    0    0    0    0    0    1    0
  cDC2                 0    1    2    0    3    0   10    0    0   36    0    0    0    1
  dnT                  0    3    1    1    1    0    2    0    0    3    0    1    3    0
  HSPC                57   10    1    0  211    0  678  483    0    5  358    0    2    0
  NK Proliferating     4   40   23 2785  237    0   10   12    0   22    1    0   27    0
  Treg                15   14    1    0    1    0    0    0    0    0    0    1   13    0

3. Perform Harmony Integration


# Perform Harmony integration
All_samples_Merged <- RunHarmony(All_samples_Merged, 
                                 group.by.vars = "cell_line", 
                                 reduction.use = "pca", 
                                 dim.use = 1:15,
                                 theta = 0.5, 
                                 assay.use = "SCT")
Transposing data matrix
Initializing state using k-means centroids initialization
Harmony 1/10
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Harmony 2/10
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Harmony converged after 2 iterations
# Find neighbors using the Harmony reduction (FindNeighbors defaults to reduction = "pca", so "harmony" must be specified explicitly here)
All_samples_Merged <- FindNeighbors(All_samples_Merged,reduction = "harmony", dims = 1:15)  # Use the first 16 PCs from Harmony integration
Computing nearest neighbor graph
Computing SNN
# Find clusters based on the neighbors found in the Harmony space
All_samples_Merged <- FindClusters(All_samples_Merged, reduction = "harmony", resolution = 0.2)  # Clustering based on PC space (default)
Avis : The following arguments are not used: reductionAvis : The following arguments are not used: reduction
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 49372
Number of edges: 1541403

Running Louvain algorithm...
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9435
Number of communities: 8
Elapsed time: 16 seconds
# Run UMAP on the new Harmony reduction
All_samples_Merged <- RunUMAP(All_samples_Merged, reduction = "harmony", dims = 1:15, reduction.name = "umap.harmony")
15:04:54 UMAP embedding parameters a = 0.9922 b = 1.112
15:04:54 Read 49372 rows and found 15 numeric columns
15:04:54 Using Annoy for neighbor search, n_neighbors = 30
15:04:54 Building Annoy index with metric = cosine, n_trees = 50
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:05:00 Writing NN index file to temp file /tmp/RtmpokjsJx/file175cc238e7a0ee
15:05:00 Searching Annoy index using 1 thread, search_k = 3000
15:05:22 Annoy recall = 100%
15:05:23 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
15:05:27 Initializing from normalized Laplacian + noise (using RSpectra)
15:05:29 Commencing optimization for 200 epochs, with 2023778 positive edges
Using method 'umap'
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:05:58 Optimization finished

4. Visualize Harmony Integrated Data


# Visualization after Harmony

# By cell line
p3 <- DimPlot(All_samples_Merged, 
              reduction = "umap.harmony", 
              group.by = "cell_line",
              label = TRUE, 
              label.box = TRUE) + 
      ggtitle("After Harmony - By Cell Line")

# By clusters
p4 <- DimPlot(All_samples_Merged, 
              reduction = "umap.harmony", 
              group.by = "seurat_clusters",
              label = TRUE, 
              label.box = TRUE) + 
      ggtitle("After Harmony - By Clusters")

# By cell type annotations
p5 <- DimPlot(All_samples_Merged, 
              reduction = "umap.harmony", 
              group.by = "predicted.celltype.l2",
              label = TRUE, 
              label.box = TRUE) + 
      ggtitle("After Harmony - Cell Type Annotations")

# Print comparison plots
p3 + p4

print(p5)


DimPlot(All_samples_Merged, reduction = "umap.harmony", group.by = "cell_line", label = T, label.box = T, repel = T) + 
  ggtitle("Harmony Integration - By Cell Line")

DimPlot(All_samples_Merged, reduction = "umap.harmony", group.by = "seurat_clusters",label = T, label.box = T, repel = T) + 
  ggtitle("Harmony Integration - By Clusters")

DimPlot(All_samples_Merged, reduction = "umap.harmony", group.by = "predicted.celltype.l2",label = T, label.box = T, repel = T) + 
  ggtitle("Harmony Integration - Annotations")


table(All_samples_Merged$predicted.celltype.l2, All_samples_Merged$cell_line)
                   
                      L1   L2   L3   L4   L5   L6   L7 PBMC PBMC_10x
  B intermediate       0    0    2    1    2    2    0    0        0
  B memory             0    0   11    1   38   82  120    0        0
  CD14 Mono            0    0    1    0    5    0    6    0        0
  CD4 CTL              0    0    0    0    0    0    0   12        1
  CD4 Naive            0    0    0    7    0    0    0  523     1512
  CD4 Proliferating 2461 2852 5452 5391 4732 4002 4115    0        6
  CD4 TCM           3320  270  887  562  178  557  517 4576     1963
  CD4 TEM              1    0    0    0    0    0    0   60       23
  CD8 Proliferating    0    0    0    0    0    1    1    0        0
  CD8 TCM              1   16    0    0    0    0    0    0        0
  CD8 TEM              1    8    0    0    2    3    1    0        0
  cDC1                 0    0    0    0    2    6    0    0        0
  cDC2                 0    0    0    4   11    3   35    0        0
  dnT                  2    3    0    1    2    5    2    0        0
  HSPC                 0    0   60    7 1035  213  490    0        0
  NK Proliferating    38 2785    6   24   11  259   38    0        0
  Treg                 1    1    9    9    4   15    6    0        0
table(All_samples_Merged$predicted.celltype.l2, All_samples_Merged$seurat_clusters)
                   
                        0     1     2     3     4     5     6     7
  B intermediate        0     0     0     4     0     0     3     0
  B memory            199     4     0    28     0     6    15     0
  CD14 Mono            11     0     0     0     0     0     0     1
  CD4 CTL               0     0     9     1     2     0     0     1
  CD4 Naive             0     0  1990    49     2     0     0     1
  CD4 Proliferating 10219 10418   130  2336  2782  2493   633     0
  CD4 TCM            1440  1033  6207  3854   138    43    64    51
  CD4 TEM               0     0    82     1     1     0     0     0
  CD8 Proliferating     2     0     0     0     0     0     0     0
  CD8 TCM               0     1     0    13     3     0     0     0
  CD8 TEM               6     0     0     7     2     0     0     0
  cDC1                  4     0     0     1     0     0     3     0
  cDC2                 48     0     0     0     0     1     2     2
  dnT                   7     0     0     8     0     0     0     0
  HSPC               1039    58     0    25     1   649    33     0
  NK Proliferating    270    25    24    87  2739     7     9     0
  Treg                  2     0     1    42     0     0     0     0

Visualize Harmony Integrated Data distribution



table(All_samples_Merged$predicted.celltype.l2, All_samples_Merged$cell_line)
                   
                      L1   L2   L3   L4   L5   L6   L7 PBMC PBMC_10x
  B intermediate       0    0    2    1    2    2    0    0        0
  B memory             0    0   11    1   38   82  120    0        0
  CD14 Mono            0    0    1    0    5    0    6    0        0
  CD4 CTL              0    0    0    0    0    0    0   12        1
  CD4 Naive            0    0    0    7    0    0    0  523     1512
  CD4 Proliferating 2461 2852 5452 5391 4732 4002 4115    0        6
  CD4 TCM           3320  270  887  562  178  557  517 4576     1963
  CD4 TEM              1    0    0    0    0    0    0   60       23
  CD8 Proliferating    0    0    0    0    0    1    1    0        0
  CD8 TCM              1   16    0    0    0    0    0    0        0
  CD8 TEM              1    8    0    0    2    3    1    0        0
  cDC1                 0    0    0    0    2    6    0    0        0
  cDC2                 0    0    0    4   11    3   35    0        0
  dnT                  2    3    0    1    2    5    2    0        0
  HSPC                 0    0   60    7 1035  213  490    0        0
  NK Proliferating    38 2785    6   24   11  259   38    0        0
  Treg                 1    1    9    9    4   15    6    0        0
table(All_samples_Merged$predicted.celltype.l2, All_samples_Merged$seurat_clusters)
                   
                        0     1     2     3     4     5     6     7
  B intermediate        0     0     0     4     0     0     3     0
  B memory            199     4     0    28     0     6    15     0
  CD14 Mono            11     0     0     0     0     0     0     1
  CD4 CTL               0     0     9     1     2     0     0     1
  CD4 Naive             0     0  1990    49     2     0     0     1
  CD4 Proliferating 10219 10418   130  2336  2782  2493   633     0
  CD4 TCM            1440  1033  6207  3854   138    43    64    51
  CD4 TEM               0     0    82     1     1     0     0     0
  CD8 Proliferating     2     0     0     0     0     0     0     0
  CD8 TCM               0     1     0    13     3     0     0     0
  CD8 TEM               6     0     0     7     2     0     0     0
  cDC1                  4     0     0     1     0     0     3     0
  cDC2                 48     0     0     0     0     1     2     2
  dnT                   7     0     0     8     0     0     0     0
  HSPC               1039    58     0    25     1   649    33     0
  NK Proliferating    270    25    24    87  2739     7     9     0
  Treg                  2     0     1    42     0     0     0     0
table(All_samples_Merged$cell_line, All_samples_Merged$seurat_clusters)
          
              0    1    2    3    4    5    6    7
  L1        425  163  132 4909  185    5    6    0
  L2         26   15   27  433 5432    1    1    0
  L3         21 6151    1  172    9    1   72    1
  L4        410 5184    3  150    5    1  252    2
  L5       2617   13    2  124    5 2983  278    0
  L6       4794    6    5  209   10   33   91    0
  L7       4945    7    1  147    6  175   50    0
  PBMC        7    0 4938  165   18    0   12   31
  PBMC_10x    2    0 3334  147    0    0    0   22

5. Marker Gene Visualization



# Set marker genes specific to requested immune cell types
myfeatures1 <- c("CD19", "CD79A", "MS4A1", # B cells
                "CD14", "LYZ", "FCGR3A", # Monocytes
                "CSF1R", "CD68", # Macrophages
                "NKG7", "GNLY", "KIR3DL1", # NK cells
                "MKI67", # Proliferating NK cells
                "CD34", "KIT", # HSPCs
                "CD3E", "CCR7", # T cells
                "SELL", "CD45RO", # Tnaive, Tcm
                "CD44", "CD45RA") # Tem, Temra

cd4_feature_plot1 <- FeaturePlot(
  All_samples_Merged, 
  features = myfeatures1, 
  reduction = "umap.harmony", 
  ncol = 4
) + 
  ggtitle("CD4 T Cell Marker Expression - Harmony Integration") +
  NoLegend()
Avis : Could not find CD45RO in the default search locations, found in 'ADT' assay insteadAvis : Could not find CD45RA in the default search locations, found in 'ADT' assay instead
# Display the plot
print(cd4_feature_plot1)


# Define markers specific to CD4 T cells and their subsets
cd4_markers <- c(
  "CD4",          # General CD4 T cells
  "IL7R",         # Naive T cells
  "CCR7",         # T central memory (Tcm) cells
  "SELL",         # T naive cells
  "FOXP3",        # Regulatory T cells (Tregs)
  "IL2RA",        # Activated T cells
  "PDCD1",        # Exhausted T cells
  "LAG3",         # Exhausted T cells
  "TIGIT",        # Exhausted T cells
  "GATA3",        # Th2 cells
  "TBX21",        # Th1 cells
  "RORC",         # Th17 cells
  "BCL6"          # T follicular helper (Tfh) cells
)

# Visualize marker genes for CD4 T cells
cd4_feature_plot2 <- FeaturePlot(
  All_samples_Merged, 
  features = cd4_markers, 
  reduction = "umap.harmony", 
  ncol = 4
) + 
  ggtitle("CD4 T Cell Marker Expression - Harmony Integration") +
  NoLegend()

# Display the plot
print(cd4_feature_plot2)

CD4 T Cell Marker Visualization

# Set marker genes specific to CD4 T cell biology and states
cd4_markers <- c(
    # Core T cell markers
    "CD3E",     # T cell marker
    "CD4",      # CD4 T cell marker
    
    # Naive/Memory markers
    "CCR7",     # Naive/Central memory
    "SELL",     # L-selectin, naive marker
    "CD27",     # Memory marker
    "IL7R",     # Naive/Memory marker
    
    # Activation/State markers
    "IL2RA",    # CD25, activation marker
    "CD69",     # Early activation
    "HLA-DRA",  # Activation marker
    
    # Exhaustion markers
    "PDCD1",    # PD-1
    "LAG3",     # Exhaustion marker
    "TIGIT",    # Exhaustion marker
    
    # Regulatory T cell markers
    "FOXP3",    # Treg marker
    "IL2RA",    # CD25, Treg marker
    "CTLA4",    # Treg/exhaustion marker
    
    # Effector/Function markers
    "IL2",      # T cell function
    "IFNG",     # Th1
    "IL4",      # Th2
    "IL13",     # Th2
    "IL17A"     # Th17
)

# Create feature plots with better visualization
FeaturePlot(All_samples_Merged, 
            features = cd4_markers, 
            reduction = "umap.harmony", 
            ncol = 4,
            pt.size = 0.1,           # Smaller point size for better resolution
            min.cutoff = "q1",       # Remove bottom 1% of expression
            max.cutoff = "q99",      # Remove top 1% of expression
            order = TRUE) +          # Plot highest expressing cells on top
    ggtitle("CD4 T Cell Marker Expression - Harmony Integration") +
    theme(plot.title = element_text(size = 16, face = "bold")) +
    NoLegend()


# Optional: Add violin plots to see expression distribution across clusters
VlnPlot(All_samples_Merged, 
        features = cd4_markers[1:20], # First 8 markers
        stack = TRUE,
        flip = TRUE) +
        ggtitle("CD4 T Cell Marker Distribution Across Clusters")

NA
NA

FindMarkers

# Idents(All_samples_Merged) <- "seurat_clusters"
# 
# markers1 <- FindMarkers (All_samples_Merged,
#                               ident.1 = c("0","2", "3", "8","1", "5", "6", "9", "10"), #cell lines
#                               ident.2 = c("4","7"), #CD4 T cells
#                               assay = "SCT",
#                               test.use =  "MAST",
#                               latent.vars = c("cell_line")) #batch
# 
# markers2 <- FindMarkers(All_samples_Merged,
#                         ident.1 = c("0", "2", "3", "8", "1", "5", "6", "9", "10"), #cell lines
#                         ident.2 = c("4", "7"), # CD4 T cells
#                         assay = "SCT",
#                         test.use = "wilcox")  # Change to Wilcoxon test
#                         
# 
# 
# markers3 <- FindMarkers(All_samples_Merged,
#                         ident.1 = c("0", "2", "3", "8", "1", "5", "6", "9", "10"), # cell lines
#                         ident.2 = c("4", "7"), # CD4 T cells
#                         assay = "SCT",
#                         test.use = "MAST")  # Use MAST
#                         
# write.csv(markers1, file = "0-imp_Robj/TEST_MAST_with_batch_results.csv", row.names = TRUE)
# 
# write.csv(markers2, file = "0-imp_Robj/TEST_Wilcox_without_batch_results2.csv", row.names = TRUE)
# 
# write.csv(markers3, file = "0-imp_Robj/TEST_MAST_without_batch_results3.csv", row.names = TRUE)
# 
# #Marker1
# 
# # Subset cells in each group
# group1_cells <- WhichCells(All_samples_Merged, idents = c("0", "2", "3", "8", "1", "5", "6", "9", "10"))
# group2_cells <- WhichCells(All_samples_Merged, idents = c("4", "7"))
# 
# # Extract normalized expression values
# expression_data <- GetAssayData(All_samples_Merged, slot = "data")  # log-normalized values
# 
# # Calculate mean expression for each group
# group1_mean <- rowMeans(expression_data[, group1_cells])
# group2_mean <- rowMeans(expression_data[, group2_cells])
# 
# 
# # Add mean expression to markers result
# markers1$mean_expr_group1 <- group1_mean[rownames(markers1)]
# markers1$mean_expr_group2 <- group2_mean[rownames(markers1)]
# 
# 
# #Marker2
# 
# # Add mean expression to markers result
# markers2$mean_expr_group1 <- group1_mean[rownames(markers2)]
# markers2$mean_expr_group2 <- group2_mean[rownames(markers2)]
# 
# #Marker3
# 
# # Add mean expression to markers result
# markers3$mean_expr_group1 <- group1_mean[rownames(markers3)]
# markers3$mean_expr_group2 <- group2_mean[rownames(markers3)]
# 
# write.csv(markers1, file = "0-imp_Robj/TEST_MAST_with_batch_results1_with_meanExpression.csv", row.names = TRUE)
# 
# write.csv(markers2, file = "0-imp_Robj/TEST_Wilcox_without_batch_results2_with_meanExpression.csv", row.names = TRUE)
# 
# write.csv(markers3, file = "0-imp_Robj/TEST_MAST_without_batch_results3_with_meanExpression.csv", row.names = TRUE)
# 
# 
# #Marker1
# # Count genes with p_val_adj = 0
# num_pval0 <- sum(markers1$p_val_adj == 0)
# cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
# 
# # Count genes with p_val_adj = 1
# num_pval1 <- sum(markers1$p_val_adj == 1)
# cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
# 
# 
# num_significant <- sum(markers1$p_val_adj < 0.05)
# cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
# 
# 
# # Genes with avg_log2FC > 1 (upregulated)
# num_upregulated <- sum(markers1$avg_log2FC > 1)
# cat("Number of upregulated genes (avg_log2FC > 1):", num_upregulated, "\n")
# 
# # Genes with avg_log2FC < -1 (downregulated)
# num_downregulated <- sum(markers1$avg_log2FC < -1)
# cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
# 
# #Marker2
# 
# # Count genes with p_val_adj = 0
# num_pval0 <- sum(markers2$p_val_adj == 0)
# cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
# 
# # Count genes with p_val_adj = 1
# num_pval1 <- sum(markers2$p_val_adj == 1)
# cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
# 
# 
# num_significant <- sum(markers2$p_val_adj < 0.05)
# cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
# 
# 
# # Genes with avg_log2FC > 1 (upregulated)
# num_upregulated <- sum(markers2$avg_log2FC > 1)
# cat("Number of upregulated genes (avg_log2FC > 1):", num_upregulated, "\n")
# 
# # Genes with avg_log2FC < -1 (downregulated)
# num_downregulated <- sum(markers2$avg_log2FC < -1)
# cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
# 
# 
# #Marker3
# 
# # Count genes with p_val_adj = 0
# num_pval0 <- sum(markers3$p_val_adj == 0)
# cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
# 
# # Count genes with p_val_adj = 1
# num_pval1 <- sum(markers3$p_val_adj == 1)
# cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
# 
# 
# num_significant <- sum(markers3$p_val_adj < 0.05)
# cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
# 
# 
# # Genes with avg_log2FC > 1 (upregulated)
# num_upregulated <- sum(markers3$avg_log2FC > 1)
# cat("Number of upregulated genes (avg_log2FC > 1):", num_upregulated, "\n")
# 
# # Genes with avg_log2FC < -1 (downregulated)
# num_downregulated <- sum(markers3$avg_log2FC < -1)
# cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
# 

6. Save the Seurat object as an Robj file


#save(All_samples_Merged, file = "0-imp_Robj/Harmony_integrated_CD4Tcells_harmony_integrated.Robj")
---
title: "Harmony integrations of PBMC10x by cell_line-theta-0.5 "
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  #rmdformats::readthedown
  html_notebook:
    toc: true
    toc_float: true
    toc_collapsed: true
---


# 1. load libraries
```{r setup, include=FALSE}
library(Seurat)
library(SeuratWrappers)
library(SeuratObject)
library(SeuratData)
library(patchwork)
library(harmony)
library(ggplot2)
library(reticulate)
library(Azimuth)
library(dplyr)
library(Rtsne)
library(harmony)


```




# 2. Load Seurat Object 
```{r load_seurat, fig.height=6, fig.width=8}

# Load the Seurat object merged from the cell lines and a control after
# filtration. load() restores objects under the names they were saved with;
# the rest of this notebook expects one called `All_samples_Merged`, so fail
# early if the file did not provide it.
loaded_objects <- load("../0-R_Objects/CD4Tcells_SCTnormalized_done_on_HPC_inluding_Patient_origin.robj")
stopifnot("All_samples_Merged" %in% loaded_objects)

# Draw the pre-integration UMAP ("umap" reduction) coloured by one metadata
# column.
#   group_by: name of the metadata column used to colour and label cells
#   title:    plot title
# Returns the ggplot object, which auto-prints when called at top level.
plot_umap_before_harmony <- function(group_by, title) {
  DimPlot(All_samples_Merged,
          reduction = "umap",
          group.by = group_by,
          label = TRUE,
          label.box = TRUE) +
    ggtitle(title)
}

# Visualize before Harmony integration
# (the Patient_origin plot was previously mislabelled "By Cell Line")
plot_umap_before_harmony("Patient_origin", "Before Harmony - By Patient Origin")

plot_umap_before_harmony("cell_line", "Before Harmony - By Cell Line")

plot_umap_before_harmony("SCT_snn_res.0.5", "Before Harmony - By Clusters")

plot_umap_before_harmony("predicted.celltype.l1", "Before Harmony - By Annotation.l1")

plot_umap_before_harmony("predicted.celltype.l2", "Before Harmony - By Annotation.l2")

plot_umap_before_harmony("predicted.celltype.l3", "Before Harmony - By Annotation.l3")

# Cross-tabulate the level-2 predicted cell types (rows) against the
# pre-integration clusters at resolution 0.5 (columns)
table(All_samples_Merged$predicted.celltype.l2, All_samples_Merged$SCT_snn_res.0.5)


```


# 3.  Perform Harmony Integration
```{r harmony-integration, fig.height=8, fig.width=12}

# Perform Harmony integration on the first 15 PCs, using cell line as the
# batch variable.
# The dimension argument of RunHarmony() is `dims.use`. The previous spelling
# `dim.use` matches no formal argument (it is not a prefix of `dims.use`), so
# it was swallowed by `...` and the 1:15 restriction was never applied.
# NOTE(review): `assay.use` is kept as in the original call; recent harmony
# releases may ignore it -- confirm against the installed version.
All_samples_Merged <- RunHarmony(All_samples_Merged,
                                 group.by.vars = "cell_line",
                                 reduction.use = "pca",
                                 dims.use = 1:15,
                                 theta = 0.5,
                                 assay.use = "SCT")

# Build the nearest-neighbour / SNN graph on the Harmony embedding.
# `reduction = "harmony"` is required here: FindNeighbors() defaults to "pca".
# dims = 1:15 uses the first 15 Harmony dimensions.
All_samples_Merged <- FindNeighbors(All_samples_Merged,
                                    reduction = "harmony",
                                    dims = 1:15)

# Cluster on the graph computed above. FindClusters() has no `reduction`
# argument (passing one only triggers an "arguments are not used" warning);
# the Harmony space is already encoded in the neighbour graph.
All_samples_Merged <- FindClusters(All_samples_Merged, resolution = 0.2)

# Run UMAP on the Harmony reduction and store it under its own name so the
# pre-integration "umap" reduction is preserved for comparison.
All_samples_Merged <- RunUMAP(All_samples_Merged,
                              reduction = "harmony",
                              dims = 1:15,
                              reduction.name = "umap.harmony")


```

# 4.  Visualize Harmony Integrated Data
```{r harmony-visualization1, fig.height=8, fig.width=12}

# Visualization after Harmony: every plot below uses the "umap.harmony"
# reduction produced by the integration chunk.

# By cell line
p3 <- DimPlot(All_samples_Merged,
              reduction = "umap.harmony",
              group.by = "cell_line",
              label = TRUE,
              label.box = TRUE) +
      ggtitle("After Harmony - By Cell Line")

# By clusters
p4 <- DimPlot(All_samples_Merged,
              reduction = "umap.harmony",
              group.by = "seurat_clusters",
              label = TRUE,
              label.box = TRUE) +
      ggtitle("After Harmony - By Clusters")

# By cell type annotations
p5 <- DimPlot(All_samples_Merged,
              reduction = "umap.harmony",
              group.by = "predicted.celltype.l2",
              label = TRUE,
              label.box = TRUE) +
      ggtitle("After Harmony - Cell Type Annotations")

# Print comparison plots: cell line and clusters side by side (patchwork `+`),
# then the annotation plot on its own
p3 + p4
print(p5)

# Same three views again with repel = TRUE so that labels do not overlap.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
DimPlot(All_samples_Merged,
        reduction = "umap.harmony",
        group.by = "cell_line",
        label = TRUE,
        label.box = TRUE,
        repel = TRUE) +
  ggtitle("Harmony Integration - By Cell Line")
DimPlot(All_samples_Merged,
        reduction = "umap.harmony",
        group.by = "seurat_clusters",
        label = TRUE,
        label.box = TRUE,
        repel = TRUE) +
  ggtitle("Harmony Integration - By Clusters")
DimPlot(All_samples_Merged,
        reduction = "umap.harmony",
        group.by = "predicted.celltype.l2",
        label = TRUE,
        label.box = TRUE,
        repel = TRUE) +
  ggtitle("Harmony Integration - Annotations")

# Level-2 predicted cell type (rows) by cell line (columns)
table(All_samples_Merged$predicted.celltype.l2, All_samples_Merged$cell_line)

# Level-2 predicted cell type (rows) by post-Harmony cluster (columns)
table(All_samples_Merged$predicted.celltype.l2, All_samples_Merged$seurat_clusters)

```
##  Visualize Harmony Integrated Data distribution
```{r harmony-tables, fig.height=8, fig.width=12}


# Cross-tabulate cell-level metadata to see how predicted cell types, cell
# lines and post-Harmony clusters overlap.
cell_meta <- All_samples_Merged@meta.data

# Level-2 predicted cell type (rows) by cell line (columns)
table(cell_meta$predicted.celltype.l2, cell_meta$cell_line)

# Level-2 predicted cell type (rows) by cluster (columns)
table(cell_meta$predicted.celltype.l2, cell_meta$seurat_clusters)

# Cell line (rows) by cluster (columns)
table(cell_meta$cell_line, cell_meta$seurat_clusters)

```
# 5.  Marker Gene Visualization
```{r featureplot-harmony1, fig.height=14, fig.width=18}


# Set marker genes specific to requested immune cell types
# NOTE(review): per the warnings in the rendered output, "CD45RO" and "CD45RA"
# are not found in the default assay and are taken from the 'ADT' assay
# instead -- confirm that this fallback is intended.
myfeatures1 <- c("CD19", "CD79A", "MS4A1", # B cells
                "CD14", "LYZ", "FCGR3A", # Monocytes
                "CSF1R", "CD68", # Macrophages
                "NKG7", "GNLY", "KIR3DL1", # NK cells
                "MKI67", # Proliferating NK cells
                "CD34", "KIT", # HSPCs
                "CD3E", "CCR7", # T cells
                "SELL", "CD45RO", # Tnaive, Tcm
                "CD44", "CD45RA") # Tem, Temra

# With several features FeaturePlot() returns a patchwork of panels. On a
# patchwork, `+` only modifies the LAST panel (so `+ ggtitle()` would
# overwrite that panel's gene name and `+ NoLegend()` would strip only its
# legend). Use `&` to apply NoLegend() to every panel and plot_annotation()
# to add a single overall title.
cd4_feature_plot1 <- FeaturePlot(
  All_samples_Merged,
  features = myfeatures1,
  reduction = "umap.harmony",
  ncol = 4
) & NoLegend()
cd4_feature_plot1 <- cd4_feature_plot1 +
  patchwork::plot_annotation(
    title = "CD4 T Cell Marker Expression - Harmony Integration"
  )

# Display the plot
print(cd4_feature_plot1)

# Define markers specific to CD4 T cells and their subsets
cd4_markers <- c(
  "CD4",          # General CD4 T cells
  "IL7R",         # Naive T cells
  "CCR7",         # T central memory (Tcm) cells
  "SELL",         # T naive cells
  "FOXP3",        # Regulatory T cells (Tregs)
  "IL2RA",        # Activated T cells
  "PDCD1",        # Exhausted T cells
  "LAG3",         # Exhausted T cells
  "TIGIT",        # Exhausted T cells
  "GATA3",        # Th2 cells
  "TBX21",        # Th1 cells
  "RORC",         # Th17 cells
  "BCL6"          # T follicular helper (Tfh) cells
)

# Visualize marker genes for CD4 T cells (same `&` / plot_annotation pattern
# as above so that the title and legend removal apply to the whole figure)
cd4_feature_plot2 <- FeaturePlot(
  All_samples_Merged,
  features = cd4_markers,
  reduction = "umap.harmony",
  ncol = 4
) & NoLegend()
cd4_feature_plot2 <- cd4_feature_plot2 +
  patchwork::plot_annotation(
    title = "CD4 T Cell Marker Expression - Harmony Integration"
  )

# Display the plot
print(cd4_feature_plot2)
```

##  CD4 T Cell Marker Visualization
```{r featureplot-harmony2, fig.height=12, fig.width=16}
# Set marker genes specific to CD4 T cell biology and states.
# Each gene is listed once: IL2RA (CD25) was previously repeated under both
# "activation" and "Treg", which put a duplicate entry in the feature vector.
cd4_markers <- c(
    # Core T cell markers
    "CD3E",     # T cell marker
    "CD4",      # CD4 T cell marker

    # Naive/Memory markers
    "CCR7",     # Naive/Central memory
    "SELL",     # L-selectin, naive marker
    "CD27",     # Memory marker
    "IL7R",     # Naive/Memory marker

    # Activation/State markers
    "IL2RA",    # CD25, activation marker (also a Treg marker)
    "CD69",     # Early activation
    "HLA-DRA",  # Activation marker

    # Exhaustion markers
    "PDCD1",    # PD-1
    "LAG3",     # Exhaustion marker
    "TIGIT",    # Exhaustion marker

    # Regulatory T cell markers
    "FOXP3",    # Treg marker
    "CTLA4",    # Treg/exhaustion marker

    # Effector/Function markers
    "IL2",      # T cell function
    "IFNG",     # Th1
    "IL4",      # Th2
    "IL13",     # Th2
    "IL17A"     # Th17
)

# Create feature plots with better visualization.
# FeaturePlot() returns a patchwork when given several features, so theme
# changes are applied to every panel with `&`, and the overall title is added
# with plot_annotation(); `+ ggtitle()` would only retitle the last panel.
FeaturePlot(All_samples_Merged,
            features = cd4_markers,
            reduction = "umap.harmony",
            ncol = 4,
            pt.size = 0.1,           # Smaller point size for better resolution
            min.cutoff = "q1",       # Clip values below the 1st percentile
            max.cutoff = "q99",      # Clip values above the 99th percentile
            order = TRUE) &          # Plot highest expressing cells on top
    NoLegend() &
    theme(plot.title = element_text(size = 16, face = "bold")) +
    patchwork::plot_annotation(
        title = "CD4 T Cell Marker Expression - Harmony Integration"
    )

# Optional: Add violin plots to see expression distribution across clusters.
# All markers are shown; the earlier `[1:20]` index selected the entire
# vector anyway. A stacked VlnPlot is a single ggplot, so `+ ggtitle()` is
# correct here.
VlnPlot(All_samples_Merged,
        features = cd4_markers,
        stack = TRUE,
        flip = TRUE) +
        ggtitle("CD4 T Cell Marker Distribution Across Clusters")


```

##  FindMarkers
```{r markers, fig.height=12, fig.width=16}
# Idents(All_samples_Merged) <- "seurat_clusters"
# 
# markers1 <- FindMarkers (All_samples_Merged,
#                               ident.1 = c("0","2", "3", "8","1", "5", "6", "9", "10"), #cell lines
#                               ident.2 = c("4","7"), #CD4 T cells
#                               assay = "SCT",
#                               test.use = 	"MAST",
#                               latent.vars	= c("cell_line")) #batch
# 
# markers2 <- FindMarkers(All_samples_Merged,
#                         ident.1 = c("0", "2", "3", "8", "1", "5", "6", "9", "10"), #cell lines
#                         ident.2 = c("4", "7"), # CD4 T cells
#                         assay = "SCT",
#                         test.use = "wilcox")  # Change to Wilcoxon test
#                         
# 
# 
# markers3 <- FindMarkers(All_samples_Merged,
#                         ident.1 = c("0", "2", "3", "8", "1", "5", "6", "9", "10"), # cell lines
#                         ident.2 = c("4", "7"), # CD4 T cells
#                         assay = "SCT",
#                         test.use = "MAST")  # Use MAST
#                         
# write.csv(markers1, file = "0-imp_Robj/TEST_MAST_with_batch_results.csv", row.names = TRUE)
# 
# write.csv(markers2, file = "0-imp_Robj/TEST_Wilcox_without_batch_results2.csv", row.names = TRUE)
# 
# write.csv(markers3, file = "0-imp_Robj/TEST_MAST_without_batch_results3.csv", row.names = TRUE)
# 
# #Marker1
# 
# # Subset cells in each group
# group1_cells <- WhichCells(All_samples_Merged, idents = c("0", "2", "3", "8", "1", "5", "6", "9", "10"))
# group2_cells <- WhichCells(All_samples_Merged, idents = c("4", "7"))
# 
# # Extract normalized expression values
# expression_data <- GetAssayData(All_samples_Merged, slot = "data")  # log-normalized values
# 
# # Calculate mean expression for each group
# group1_mean <- rowMeans(expression_data[, group1_cells])
# group2_mean <- rowMeans(expression_data[, group2_cells])
# 
# 
# # Add mean expression to markers result
# markers1$mean_expr_group1 <- group1_mean[rownames(markers1)]
# markers1$mean_expr_group2 <- group2_mean[rownames(markers1)]
# 
# 
# #Marker2
# 
# # Add mean expression to markers result
# markers2$mean_expr_group1 <- group1_mean[rownames(markers2)]
# markers2$mean_expr_group2 <- group2_mean[rownames(markers2)]
# 
# #Marker3
# 
# # Add mean expression to markers result
# markers3$mean_expr_group1 <- group1_mean[rownames(markers3)]
# markers3$mean_expr_group2 <- group2_mean[rownames(markers3)]
# 
# write.csv(markers1, file = "0-imp_Robj/TEST_MAST_with_batch_results1_with_meanExpression.csv", row.names = TRUE)
# 
# write.csv(markers2, file = "0-imp_Robj/TEST_Wilcox_without_batch_results2_with_meanExpression.csv", row.names = TRUE)
# 
# write.csv(markers3, file = "0-imp_Robj/TEST_MAST_without_batch_results3_with_meanExpression.csv", row.names = TRUE)
# 
# 
# #Marker1
# # Count genes with p_val_adj = 0
# num_pval0 <- sum(markers1$p_val_adj == 0)
# cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
# 
# # Count genes with p_val_adj = 1
# num_pval1 <- sum(markers1$p_val_adj == 1)
# cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
# 
# 
# num_significant <- sum(markers1$p_val_adj < 0.05)
# cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
# 
# 
# # Genes with avg_log2FC > 1 (upregulated)
# num_upregulated <- sum(markers1$avg_log2FC > 1)
# cat("Number of upregulated genes (avg_log2FC > 1):", num_upregulated, "\n")
# 
# # Genes with avg_log2FC < -1 (downregulated)
# num_downregulated <- sum(markers1$avg_log2FC < -1)
# cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
# 
# #Marker2
# 
# # Count genes with p_val_adj = 0
# num_pval0 <- sum(markers2$p_val_adj == 0)
# cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
# 
# # Count genes with p_val_adj = 1
# num_pval1 <- sum(markers2$p_val_adj == 1)
# cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
# 
# 
# num_significant <- sum(markers2$p_val_adj < 0.05)
# cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
# 
# 
# # Genes with avg_log2FC > 1 (upregulated)
# num_upregulated <- sum(markers2$avg_log2FC > 1)
# cat("Number of upregulated genes (avg_log2FC > 1):", num_upregulated, "\n")
# 
# # Genes with avg_log2FC < -1 (downregulated)
# num_downregulated <- sum(markers2$avg_log2FC < -1)
# cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
# 
# 
# #Marker3
# 
# # Count genes with p_val_adj = 0
# num_pval0 <- sum(markers3$p_val_adj == 0)
# cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
# 
# # Count genes with p_val_adj = 1
# num_pval1 <- sum(markers3$p_val_adj == 1)
# cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
# 
# 
# num_significant <- sum(markers3$p_val_adj < 0.05)
# cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
# 
# 
# # Genes with avg_log2FC > 1 (upregulated)
# num_upregulated <- sum(markers3$avg_log2FC > 1)
# cat("Number of upregulated genes (avg_log2FC > 1):", num_upregulated, "\n")
# 
# # Genes with avg_log2FC < -1 (downregulated)
# num_downregulated <- sum(markers3$avg_log2FC < -1)
# cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
# 


```



# 4. Save the Seurat object as an Robj file
```{r saveROBJ}

#save(All_samples_Merged, file = "0-imp_Robj/Harmony_integrated_CD4Tcells_harmony_integrated.Robj")

```




