1. Load Libraries
2. Load Seurat Object
3. Set Up Identifiers for Clustering
# Assign cluster identities to the Seurat object
Idents(All_samples_Merged) <- "seurat_clusters"
DimPlot(All_samples_Merged, reduction = "umap", group.by = "seurat_clusters",label = T, label.box = T, repel = T) +
ggtitle("Harmony Integration - By Clusters")

4. Differential Expression Analysis
4.1 Using Wilcoxon Test (Without Batch Correction)
markers_wilcox <- FindMarkers(
All_samples_Merged,
ident.1 = c("1", "5", "9", "2", "6", "8", "0", "4", "7", "11", "12", "13"), # Cell lines-Malignant
ident.2 = c("3", "10"), # Normal CD4 T cells
assay = "SCT",
test.use = "wilcox",
min.pct = 0,
logfc.threshold = 0
)
write.csv(markers_wilcox, file = "1-Wilcox_SCT_newUMAP.csv", row.names = TRUE)
5. Compute Mean Expression for Groups
# Define cell groups
group1_cells <- WhichCells(All_samples_Merged, idents = c("1", "5", "9", "2", "6", "8", "0", "4", "7", "11", "12", "13"))
group2_cells <- WhichCells(All_samples_Merged, idents = c("3", "10"))
# Extract normalized expression values
expression_data <- GetAssayData(All_samples_Merged, slot = "data") # Log-normalized values
Avis : The `slot` argument of `GetAssayData()` is deprecated as of SeuratObject 5.0.0.
Please use the `layer` argument instead.
# Calculate mean expression for each group
calculate_mean_expression <- function(markers, group1_cells, group2_cells, expression_data) {
group1_mean <- rowMeans(expression_data[, group1_cells])
group2_mean <- rowMeans(expression_data[, group2_cells])
markers <- markers %>%
rownames_to_column("gene") %>%
mutate(mean_expr_group1 = group1_mean[gene],
mean_expr_group2 = group2_mean[gene])
return(markers)
}
markers_wilcox <- calculate_mean_expression(markers_wilcox, group1_cells, group2_cells, expression_data)
write.csv(markers_wilcox, file = "1-Wilcox_SCT_newUMAP_with_meanExpression.csv", row.names = TRUE)
6. Summarize Results
6.1 Summary Function
summarize_markers <- function(markers) {
num_pval0 <- sum(markers$p_val_adj == 0)
num_pval1 <- sum(markers$p_val_adj == 1)
num_significant <- sum(markers$p_val_adj < 0.05)
num_upregulated <- sum(markers$avg_log2FC > 1)
num_downregulated <- sum(markers$avg_log2FC < -1)
cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
cat("Number of upregulated genes (avg_log2FC > 1):", num_upregulated, "\n")
cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
}
6.3 Summary for Markers (Wilcoxon Test)
cat("Markers Summary (Wilcoxon Test):\n")
Markers Summary (Wilcoxon Test):
summarize_markers(markers_wilcox)
Number of genes with p_val_adj = 0: 4208
Number of genes with p_val_adj = 1: 10428
Number of significant genes (p_val_adj < 0.05): 15043
Number of upregulated genes (avg_log2FC > 1): 15048
Number of downregulated genes (avg_log2FC < -1): 2281
7. Visualization
7.1 Volcano Plot for Wilcoxon Test
EnhancedVolcano(markers_wilcox,
lab = markers_wilcox$gene,
x = "avg_log2FC",
y = "p_val_adj",
title = "Wilcox with SCT on NewUMAP",
pCutoff = 0.05,
FCcutoff = 1.0)
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...

8. Filter and Summarize Results
8.1 Apply Expression Filter First
# Apply the expression filter first
markers_wilcox <- markers_wilcox %>%
filter(!(mean_expr_group1 < 0.2 & mean_expr_group2 < 0.2))
markers_wilcox <- markers_wilcox %>%
filter(!(mean_expr_group1 < 0.2 & mean_expr_group2 < 0.2))
# Save filtered results
write.csv(markers_wilcox, file = "3-Wilcox_SCT_newUMAP_with_meanExpression_FILTERED.csv", row.names = TRUE)
8.2 Volcano Plot for Wilcoxon Test
EnhancedVolcano(markers_wilcox,
lab = markers_wilcox$gene,
x = "avg_log2FC",
y = "p_val_adj",
title = "Wilcox with SCT on NewUMAP",
pCutoff = 0.05,
FCcutoff = 1.0)
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...

9. Create the EnhancedVolcano plot
Malignant_CD4Tcells_vs_Normal_CD4Tcells <- markers_wilcox
library(ggplot2)
library(EnhancedVolcano)
library(dplyr)
# Define the output directory
output_dir <- "Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells"
dir.create(output_dir, showWarnings = FALSE)
# First Volcano Plot
p1 <- EnhancedVolcano(
Malignant_CD4Tcells_vs_Normal_CD4Tcells,
lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
x = "avg_log2FC",
y = "p_val_adj",
title = "Malignant_CD4Tcells_vs_Normal_CD4Tcells",
pCutoff = 1e-4,
FCcutoff = 1.0
)
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
print(p1) # Display in notebook

ggsave(filename = file.path(output_dir, "VolcanoPlot1.png"), plot = p1, width = 14, height = 10, dpi = 300)
# Second Volcano Plot with selected genes
p2 <- EnhancedVolcano(
Malignant_CD4Tcells_vs_Normal_CD4Tcells,
lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
x = "avg_log2FC",
y = "p_val_adj",
selectLab = c('EPCAM', 'BCAT1', 'KIR3DL2', 'FOXM1', 'TWIST1', 'TNFSF9',
'CD80', 'IL1B', 'RPS4Y1',
'IL7R', 'TCF7', 'MKI67', 'CD70',
'IL2RA','TRBV6-2', 'TRBV10-3', 'TRBV4-2', 'TRBV9', 'TRBV7-9',
'TRAV12-1', 'CD8B', 'FCGR3A', 'GNLY', 'FOXP3', 'SELL',
'GIMAP1', 'RIPOR2', 'LEF1', 'HOXC9', 'SP5',
'CCL17', 'ETV4', 'THY1', 'FOXA2', 'ITGAD', 'S100P', 'TBX4',
'ID1', 'XCL1', 'SOX2', 'CD27', 'CD28','PLS3','CD70','RAB25' , 'TRBV27', 'TRBV2'),
title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
xlab = bquote(~Log[2]~ 'fold change'),
pCutoff = 1e-4,
FCcutoff = 1.5,
pointSize = 3.0,
labSize = 5.0,
boxedLabels = TRUE,
colAlpha = 0.5,
legendPosition = 'right',
legendLabSize = 10,
legendIconSize = 4.0,
drawConnectors = TRUE,
widthConnectors = 0.5,
colConnectors = 'grey50',
arrowheads = FALSE,
max.overlaps = 30
)
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
print(p2) # Display in notebook

ggsave(filename = file.path(output_dir, "VolcanoPlot2.png"), plot = p2, width = 14, height = 10, dpi = 300)
# Filtering genes
filtered_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
arrange(p_val_adj, desc(abs(avg_log2FC)))
# Third Volcano Plot - Filtering by p-value and logFC
p3 <- EnhancedVolcano(
filtered_genes,
lab = ifelse(filtered_genes$p_val_adj <= 1e-4 & abs(filtered_genes$avg_log2FC) >= 1.0, filtered_genes$gene, NA),
x = "avg_log2FC",
y = "p_val_adj",
title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
pCutoff = 1e-4,
FCcutoff = 1.0,
legendPosition = 'right',
labCol = 'black',
labFace = 'bold',
boxedLabels = FALSE, # Remove boxed labels
pointSize = 3.0,
labSize = 5.0,
col = c('grey70', 'black', 'blue', 'red'), # Customize point colors
selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0]
)
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
print(p3) # Display in notebook

ggsave(filename = file.path(output_dir, "VolcanoPlot3.png"), plot = p3, width = 14, height = 10, dpi = 300)
# Fourth Volcano Plot - More refined filtering
p4 <- EnhancedVolcano(
filtered_genes,
lab = ifelse(filtered_genes$p_val_adj <= 1e-4 & abs(filtered_genes$avg_log2FC) >= 1.0, filtered_genes$gene, NA),
x = "avg_log2FC",
y = "p_val_adj",
title = "Malignant CD4 T cells (cell lines) vs Normal CD4 T cells",
subtitle = "Highlighting differentially expressed genes",
pCutoff = 1e-4,
FCcutoff = 1.0,
legendPosition = 'right',
colAlpha = 0.8, # Slight transparency for non-significant points
col = c('grey70', 'black', 'blue', 'red'), # Custom color scheme
gridlines.major = TRUE,
gridlines.minor = FALSE,
selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0]
)
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
print(p4) # Display in notebook

ggsave(filename = file.path(output_dir, "VolcanoPlot4.png"), plot = p4, width = 14, height = 10, dpi = 300)
message("All volcano plots have been displayed and saved successfully in the 'Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells' folder.")
All volcano plots have been displayed and saved successfully in the 'Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells' folder.
10. Enrichment Analysis-1
# Load necessary libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(ReactomePA)
library(DOSE) # For GSEA analysis
library(ggplot2) # Ensure ggplot2 is available for plotting
# Define threshold for differential expression selection (modified thresholds)
logFC_up_threshold <- 3.5 # Upregulated logFC threshold
logFC_down_threshold <- -1 # Downregulated logFC threshold
pval_threshold <- 1e-4 # p-value threshold as specified
# Load your differential expression results (modify based on actual data structure)
# Malignant_CD4Tcells_vs_Normal_CD4Tcells <- read.csv("Your_DE_Results_File.csv")
# Select upregulated and downregulated genes
upregulated_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[
Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC > logFC_up_threshold &
Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < pval_threshold, ]
downregulated_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[
Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC < logFC_down_threshold &
Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < pval_threshold, ]
# Check for missing genes (NAs) in the gene column and remove them
upregulated_genes <- na.omit(upregulated_genes)
downregulated_genes <- na.omit(downregulated_genes)
# Save upregulated and downregulated gene results to CSV
write.csv(upregulated_genes, "Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells/upregulated_genes_wilcox.csv", row.names = FALSE)
write.csv(downregulated_genes, "Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells/downregulated_genes_wilcox.csv", row.names = FALSE)
# Convert gene symbols to Entrez IDs for enrichment analysis, with checks for missing values
upregulated_entrez <- bitr(upregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
'select()' returned 1:1 mapping between keys and columns
Avis : 2.76% of input gene IDs are fail to map...
downregulated_entrez <- bitr(downregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
'select()' returned 1:1 mapping between keys and columns
Avis : 4% of input gene IDs are fail to map...
# Check for missing Entrez IDs
missing_upregulated <- upregulated_genes$gene[is.na(upregulated_entrez$ENTREZID)]
missing_downregulated <- downregulated_genes$gene[is.na(downregulated_entrez$ENTREZID)]
# Print out the missing gene symbols for debugging
cat("Missing upregulated genes:\n", missing_upregulated, "\n")
Missing upregulated genes:
cat("Missing downregulated genes:\n", missing_downregulated, "\n")
Missing downregulated genes:
# Remove genes that couldn't be mapped to Entrez IDs
upregulated_entrez <- upregulated_entrez$ENTREZID[!is.na(upregulated_entrez$ENTREZID)]
downregulated_entrez <- downregulated_entrez$ENTREZID[!is.na(downregulated_entrez$ENTREZID)]
# Define a function to safely run enrichment, plot results, and save them
safe_enrichGO <- function(gene_list, title, filename) {
if (length(gene_list) > 0) {
result <- enrichGO(gene = gene_list, OrgDb = org.Hs.eg.db, keyType = "SYMBOL",
ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 0.05)
if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
p <- dotplot(result, showCategory = 10, title = title)
print(p)
ggsave(paste0("Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells/", gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
write.csv(as.data.frame(result), file = paste0("Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells/", filename), row.names = FALSE)
} else {
message(paste("No significant enrichment found for:", title))
}
} else {
message(paste("No genes found for:", title))
}
}
safe_enrichKEGG <- function(entrez_list, title, filename) {
if (length(entrez_list) > 0) {
result <- enrichKEGG(gene = entrez_list, organism = "hsa", pvalueCutoff = 0.05)
if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
p <- dotplot(result, showCategory = 10, title = title)
print(p)
ggsave(paste0("Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells/", gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
write.csv(as.data.frame(result), file = paste0("Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells/", filename), row.names = FALSE)
} else {
message(paste("No significant KEGG pathways found for:", title))
}
} else {
message(paste("No genes found for:", title))
}
}
safe_enrichReactome <- function(entrez_list, title, filename) {
if (length(entrez_list) > 0) {
result <- enrichPathway(gene = entrez_list, organism = "human", pvalueCutoff = 0.05)
if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
p <- dotplot(result, showCategory = 10, title = title)
print(p)
ggsave(paste0("Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells/", gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
write.csv(as.data.frame(result), file = paste0("Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells/", filename), row.names = FALSE)
} else {
message(paste("No significant Reactome pathways found for:", title))
}
} else {
message(paste("No genes found for:", title))
}
}
# Perform enrichment analyses, generate plots, and save results
safe_enrichGO(upregulated_genes$gene, "GO Enrichment for Upregulated Genes", "upregulated_GO_results.csv")

safe_enrichGO(downregulated_genes$gene, "GO Enrichment for Downregulated Genes", "downregulated_GO_results.csv")

safe_enrichKEGG(upregulated_entrez, "KEGG Pathway Enrichment for Upregulated Genes", "upregulated_KEGG_results.csv")
Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...
Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...

safe_enrichKEGG(downregulated_entrez, "KEGG Pathway Enrichment for Downregulated Genes", "downregulated_KEGG_results.csv")

safe_enrichReactome(upregulated_entrez, "Reactome Pathway Enrichment for Upregulated Genes", "upregulated_Reactome_results.csv")

safe_enrichReactome(downregulated_entrez, "Reactome Pathway Enrichment for Downregulated Genes", "downregulated_Reactome_results.csv")

NA
NA
10.2. Enrichment Analysis-2-Hallmark
# Load necessary libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(msigdbr)
library(enrichplot)
# Load Hallmark gene sets from msigdbr
hallmark_sets <- msigdbr(species = "Homo sapiens", category = "H") # "H" is for Hallmark gene sets
# Convert gene symbols to uppercase for consistency
upregulated_genes$gene <- toupper(upregulated_genes$gene)
downregulated_genes$gene <- toupper(downregulated_genes$gene)
# Check for overlap between your upregulated/downregulated genes and Hallmark gene sets
upregulated_in_hallmark <- intersect(upregulated_genes$gene, hallmark_sets$gene_symbol)
downregulated_in_hallmark <- intersect(downregulated_genes$gene, hallmark_sets$gene_symbol)
# Print the number of overlapping genes for both upregulated and downregulated genes
cat("Number of upregulated genes in Hallmark gene sets:", length(upregulated_in_hallmark), "\n")
Number of upregulated genes in Hallmark gene sets: 322
cat("Number of downregulated genes in Hallmark gene sets:", length(downregulated_in_hallmark), "\n")
Number of downregulated genes in Hallmark gene sets: 43
# Define the output folder where the results will be saved
output_folder <- "Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells/"
# If there are genes to analyze, proceed with enrichment analysis
if (length(upregulated_in_hallmark) > 0) {
# Perform enrichment analysis for upregulated genes using Hallmark gene sets
hallmark_up <- enricher(gene = upregulated_in_hallmark,
TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")], # Ensure TERM2GENE uses correct columns
pvalueCutoff = 0.05)
# Check if results exist
if (!is.null(hallmark_up) && nrow(hallmark_up) > 0) {
# Visualize results if available
up_dotplot <- dotplot(hallmark_up, showCategory = 20, title = "Hallmark Pathway Enrichment for Upregulated Genes")
# Display the plot in the notebook
print(up_dotplot)
# Save the dotplot to a PNG file
ggsave(paste0(output_folder, "hallmark_upregulated_dotplot.png"), plot = up_dotplot, width = 10, height = 8)
# Optionally, save the results as CSV
write.csv(as.data.frame(hallmark_up), file = paste0(output_folder, "hallmark_upregulated_enrichment.csv"), row.names = FALSE)
} else {
cat("No significant enrichment found for upregulated genes.\n")
}
} else {
cat("No upregulated genes overlap with Hallmark gene sets.\n")
}

if (length(downregulated_in_hallmark) > 0) {
# Perform enrichment analysis for downregulated genes using Hallmark gene sets
hallmark_down <- enricher(gene = downregulated_in_hallmark,
TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")], # Ensure TERM2GENE uses correct columns
pvalueCutoff = 0.05)
# Check if results exist
if (!is.null(hallmark_down) && nrow(hallmark_down) > 0) {
# Visualize results if available
down_dotplot <- dotplot(hallmark_down, showCategory = 20, title = "Hallmark Pathway Enrichment for Downregulated Genes")
# Display the plot in the notebook
print(down_dotplot)
# Save the dotplot to a PNG file
ggsave(paste0(output_folder, "hallmark_downregulated_dotplot.png"), plot = down_dotplot, width = 10, height = 8)
# Optionally, save the results as CSV
write.csv(as.data.frame(hallmark_down), file = paste0(output_folder, "hallmark_downregulated_enrichment.csv"), row.names = FALSE)
} else {
cat("No significant enrichment found for downregulated genes.\n")
}
} else {
cat("No downregulated genes overlap with Hallmark gene sets.\n")
}

NA
NA
---
title: "Wilcox-SCT-Newumap-DE Analysis using Harmony Integrated Clusters"
author: "Nasir Mahmood Abbasi"
date: "`r Sys.Date()`"
output:
  html_notebook:
    toc: yes
    toc_float: yes
    toc_collapsed: yes
  pdf_document:
    toc: yes
---

# 1. Load Libraries
```{r setup, include=FALSE}
library(Seurat)
library(harmony)
library(dplyr)
library(EnhancedVolcano)
library(ggplot2)
library(pheatmap)
library(tibble)

```


# 2. Load Seurat Object 
```{r load_seurat, include=FALSE}

#Load Seurat Object merged from cell lines and a control(PBMC) after filtration
load("../23-Harmony_Integration/0-robj/5-Harmony_Integrated_All_samples_Merged_CD4Tcells_final_Resolution_Selected_0.8_ADT_Normalized_cleaned_mt.robj")

All_samples_Merged 



```

# 3. Set Up Identifiers for Clustering
```{r}
# Assign cluster identities to the Seurat object
Idents(All_samples_Merged) <- "seurat_clusters"


DimPlot(All_samples_Merged, reduction = "umap", group.by = "seurat_clusters",label = T, label.box = T, repel = T) + 
  ggtitle("Harmony Integration - By Clusters")
```

# 4.  Differential Expression Analysis
## 4.1 Using Wilcoxon Test (Without Batch Correction)
```{r}
markers_wilcox <- FindMarkers(
  All_samples_Merged,
 ident.1 = c("1", "5", "9", "2", "6", "8", "0", "4", "7", "11", "12", "13"), # Cell lines-Malignant
  ident.2 = c("3", "10"), # Normal CD4 T cells
  assay = "SCT",
  test.use = "wilcox",
  min.pct = 0,
  logfc.threshold = 0
)
write.csv(markers_wilcox, file = "1-Wilcox_SCT_newUMAP.csv", row.names = TRUE)
```


# 5. Compute Mean Expression for Groups
```{r}
# Define cell groups
group1_cells <- WhichCells(All_samples_Merged, idents = c("1", "5", "9", "2", "6", "8", "0", "4", "7", "11", "12", "13"))
group2_cells <- WhichCells(All_samples_Merged, idents = c("3", "10"))

# Extract normalized expression values
expression_data <- GetAssayData(All_samples_Merged, slot = "data")  # Log-normalized values

# Calculate mean expression for each group
calculate_mean_expression <- function(markers, group1_cells, group2_cells, expression_data) {
  group1_mean <- rowMeans(expression_data[, group1_cells])
  group2_mean <- rowMeans(expression_data[, group2_cells])
  
  markers <- markers %>%
    rownames_to_column("gene") %>%
    mutate(mean_expr_group1 = group1_mean[gene],
           mean_expr_group2 = group2_mean[gene])
  
  return(markers)
}


markers_wilcox <- calculate_mean_expression(markers_wilcox, group1_cells, group2_cells, expression_data)



write.csv(markers_wilcox, file = "2-Wilcox_SCT_newUMAP_with_meanExpression.csv", row.names = TRUE)

```

# 6. Summarize Results

## 6.1 Summary Function
```{r}
summarize_markers <- function(markers) {
  num_pval0 <- sum(markers$p_val_adj == 0)
  num_pval1 <- sum(markers$p_val_adj == 1)
  num_significant <- sum(markers$p_val_adj < 0.05)
  num_upregulated <- sum(markers$avg_log2FC > 1)
  num_downregulated <- sum(markers$avg_log2FC < -1)
  
  cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
  cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
  cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
  cat("Number of upregulated genes (avg_log2FC > 1):", num_upregulated, "\n")
  cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
}

```



## 6.3 Summary for Markers (Wilcoxon Test)
```{r}
cat("Markers Summary (Wilcoxon Test):\n")
summarize_markers(markers_wilcox)
```


# 7. Visualization

## 7.1 Volcano Plot for Wilcoxon Test
```{r, fig.height=10, fig.width=12}
EnhancedVolcano(markers_wilcox,
                lab = markers_wilcox$gene,
                x = "avg_log2FC",
                y = "p_val_adj",
                title = "Wilcox with SCT on NewUMAP",
                pCutoff = 0.05,
                FCcutoff = 1.0)

```

# 8. Filter and Summarize Results
## 8.1 Apply Expression Filter First
```{r}
# Apply the expression filter first
markers_wilcox <- markers_wilcox %>%
  filter(!(mean_expr_group1 < 0.2 & mean_expr_group2 < 0.2))


markers_wilcox <- markers_wilcox %>%
  filter(!(mean_expr_group1 < 0.2 & mean_expr_group2 < 0.2))

# Save filtered results
write.csv(markers_wilcox, file = "3-Wilcox_SCT_newUMAP_with_meanExpression_FILTERED.csv", row.names = TRUE)

```



## 8.2 Volcano Plot for Wilcoxon Test
```{r, fig.height=10, fig.width=12}
EnhancedVolcano(markers_wilcox,
                lab = markers_wilcox$gene,
                x = "avg_log2FC",
                y = "p_val_adj",
                title = "Wilcox with SCT on Filtered on mean Expression",
                pCutoff = 0.05,
                FCcutoff = 1.0)

```

# 9. Create the EnhancedVolcano plot
```{r , fig.height=8, fig.width=12}

Malignant_CD4Tcells_vs_Normal_CD4Tcells <- markers_wilcox

library(ggplot2)
library(EnhancedVolcano)
library(dplyr)

# Define the output directory
output_dir <- "Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells"
dir.create(output_dir, showWarnings = FALSE)

# First Volcano Plot
p1 <- EnhancedVolcano(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells,
  lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
  x = "avg_log2FC",
  y = "p_val_adj",
  title = "Malignant_CD4Tcells_vs_Normal_CD4Tcells",
  pCutoff = 1e-4,
  FCcutoff = 1.0
)
print(p1)  # Display in notebook
ggsave(filename = file.path(output_dir, "VolcanoPlot1.png"), plot = p1, width = 14, height = 10, dpi = 300)

# Second Volcano Plot with selected genes
p2 <- EnhancedVolcano(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells, 
  lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
  x = "avg_log2FC", 
  y = "p_val_adj",
  selectLab = c('EPCAM', 'BCAT1', 'KIR3DL2', 'FOXM1', 'TWIST1', 'TNFSF9', 
                'CD80',  'IL1B', 'RPS4Y1', 
                'IL7R', 'TCF7',  'MKI67', 'CD70', 
                'IL2RA','TRBV6-2', 'TRBV10-3', 'TRBV4-2', 'TRBV9', 'TRBV7-9', 
                'TRAV12-1', 'CD8B', 'FCGR3A', 'GNLY', 'FOXP3', 'SELL', 
                'GIMAP1', 'RIPOR2', 'LEF1', 'HOXC9', 'SP5',
                'CCL17', 'ETV4', 'THY1', 'FOXA2', 'ITGAD', 'S100P', 'TBX4', 
                'ID1', 'XCL1', 'SOX2', 'CD27', 'CD28','PLS3','CD70','RAB25' , 'TRBV27', 'TRBV2'),
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  xlab = bquote(~Log[2]~ 'fold change'),
  pCutoff = 1e-4,
  FCcutoff = 1.5, 
  pointSize = 3.0,
  labSize = 5.0,
  boxedLabels = TRUE,
  colAlpha = 0.5,
  legendPosition = 'right',
  legendLabSize = 10,
  legendIconSize = 4.0,
  drawConnectors = TRUE,
  widthConnectors = 0.5,
  colConnectors = 'grey50',
  arrowheads = FALSE,
  max.overlaps = 30
)
print(p2)  # Display in notebook
ggsave(filename = file.path(output_dir, "VolcanoPlot2.png"), plot = p2, width = 14, height = 10, dpi = 300)

# Filtering genes
filtered_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  arrange(p_val_adj, desc(abs(avg_log2FC)))

# Third Volcano Plot - Filtering by p-value and logFC
p3 <- EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 1e-4 & abs(filtered_genes$avg_log2FC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_log2FC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 1e-4,
  FCcutoff = 1.0,
  legendPosition = 'right', 
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,  # Remove boxed labels
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'blue', 'red'),  # Customize point colors
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0]
)
print(p3)  # Display in notebook
ggsave(filename = file.path(output_dir, "VolcanoPlot3.png"), plot = p3, width = 14, height = 10, dpi = 300)

# Fourth Volcano Plot - More refined filtering
p4 <- EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 1e-4 & abs(filtered_genes$avg_log2FC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_log2FC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells (cell lines) vs Normal CD4 T cells",
  subtitle = "Highlighting differentially expressed genes",
  pCutoff = 1e-4,
  FCcutoff = 1.0,
  legendPosition = 'right',
  colAlpha = 0.8,  # Slight transparency for non-significant points
  col = c('grey70', 'black', 'blue', 'red'),  # Custom color scheme
  gridlines.major = TRUE,
  gridlines.minor = FALSE,
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0]
)
print(p4)  # Display in notebook
ggsave(filename = file.path(output_dir, "VolcanoPlot4.png"), plot = p4, width = 14, height = 10, dpi = 300)

message("All volcano plots have been displayed and saved successfully in the 'Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells' folder.")



```


# 10. Enrichment Analysis-1
```{r , fig.height=6, fig.width=8}
# Load necessary libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(ReactomePA)
library(DOSE) # For GSEA analysis
library(ggplot2) # Ensure ggplot2 is available for plotting

# Define threshold for differential expression selection (modified thresholds)
logFC_up_threshold <- 3.5          # Upregulated logFC threshold
logFC_down_threshold <- -1      # Downregulated logFC threshold
pval_threshold <- 1e-4  # p-value threshold as specified

# Load your differential expression results (modify based on actual data structure)
# Malignant_CD4Tcells_vs_Normal_CD4Tcells <- read.csv("Your_DE_Results_File.csv")

# Select upregulated and downregulated genes
upregulated_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[
  Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC > logFC_up_threshold & 
  Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < pval_threshold, ]

downregulated_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[
  Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC < logFC_down_threshold & 
  Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < pval_threshold, ]

# Check for missing genes (NAs) in the gene column and remove them
upregulated_genes <- na.omit(upregulated_genes)
downregulated_genes <- na.omit(downregulated_genes)

# Save upregulated and downregulated gene results to CSV
write.csv(upregulated_genes, "Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells/upregulated_genes_wilcox.csv", row.names = FALSE)
write.csv(downregulated_genes, "Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells/downregulated_genes_wilcox.csv", row.names = FALSE)

# Convert gene symbols to Entrez IDs for enrichment analysis, with checks for missing values
upregulated_entrez <- bitr(upregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
downregulated_entrez <- bitr(downregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)

# Check for missing Entrez IDs
missing_upregulated <- upregulated_genes$gene[is.na(upregulated_entrez$ENTREZID)]
missing_downregulated <- downregulated_genes$gene[is.na(downregulated_entrez$ENTREZID)]

# Print out the missing gene symbols for debugging
cat("Missing upregulated genes:\n", missing_upregulated, "\n")
cat("Missing downregulated genes:\n", missing_downregulated, "\n")

# Remove genes that couldn't be mapped to Entrez IDs
upregulated_entrez <- upregulated_entrez$ENTREZID[!is.na(upregulated_entrez$ENTREZID)]
downregulated_entrez <- downregulated_entrez$ENTREZID[!is.na(downregulated_entrez$ENTREZID)]

# Define a function to safely run enrichment, plot results, and save them
safe_enrichGO <- function(gene_list, title, filename) {
  if (length(gene_list) > 0) {
    result <- enrichGO(gene = gene_list, OrgDb = org.Hs.eg.db, keyType = "SYMBOL",
                       ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 0.05)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)  
      ggsave(paste0("Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells/", gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0("Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells/", filename), row.names = FALSE)
    } else {
      message(paste("No significant enrichment found for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

safe_enrichKEGG <- function(entrez_list, title, filename) {
  if (length(entrez_list) > 0) {
    result <- enrichKEGG(gene = entrez_list, organism = "hsa", pvalueCutoff = 0.05)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)
      ggsave(paste0("Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells/", gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0("Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells/", filename), row.names = FALSE)
    } else {
      message(paste("No significant KEGG pathways found for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

safe_enrichReactome <- function(entrez_list, title, filename) {
  if (length(entrez_list) > 0) {
    result <- enrichPathway(gene = entrez_list, organism = "human", pvalueCutoff = 0.05)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)
      ggsave(paste0("Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells/", gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0("Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells/", filename), row.names = FALSE)
    } else {
      message(paste("No significant Reactome pathways found for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

# Perform enrichment analyses, generate plots, and save results
safe_enrichGO(upregulated_genes$gene, "GO Enrichment for Upregulated Genes", "upregulated_GO_results.csv")
safe_enrichGO(downregulated_genes$gene, "GO Enrichment for Downregulated Genes", "downregulated_GO_results.csv")

safe_enrichKEGG(upregulated_entrez, "KEGG Pathway Enrichment for Upregulated Genes", "upregulated_KEGG_results.csv")
safe_enrichKEGG(downregulated_entrez, "KEGG Pathway Enrichment for Downregulated Genes", "downregulated_KEGG_results.csv")

safe_enrichReactome(upregulated_entrez, "Reactome Pathway Enrichment for Upregulated Genes", "upregulated_Reactome_results.csv")
safe_enrichReactome(downregulated_entrez, "Reactome Pathway Enrichment for Downregulated Genes", "downregulated_Reactome_results.csv")


```




# 10.2. Enrichment Analysis-2-Hallmark
```{r , fig.height=6, fig.width=8}

# Load necessary libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(msigdbr)
library(enrichplot)

# Load Hallmark gene sets from msigdbr
hallmark_sets <- msigdbr(species = "Homo sapiens", category = "H")  # "H" is for Hallmark gene sets

# Convert gene symbols to uppercase for consistency
upregulated_genes$gene <- toupper(upregulated_genes$gene)
downregulated_genes$gene <- toupper(downregulated_genes$gene)

# Check for overlap between your upregulated/downregulated genes and Hallmark gene sets
upregulated_in_hallmark <- intersect(upregulated_genes$gene, hallmark_sets$gene_symbol)
downregulated_in_hallmark <- intersect(downregulated_genes$gene, hallmark_sets$gene_symbol)

# Print the number of overlapping genes for both upregulated and downregulated genes
cat("Number of upregulated genes in Hallmark gene sets:", length(upregulated_in_hallmark), "\n")
cat("Number of downregulated genes in Hallmark gene sets:", length(downregulated_in_hallmark), "\n")

# Define the output folder where the results will be saved
output_folder <- "Wilcox_SCT_Malignant_CD4Tcells_vs_Normal_CD4Tcells/"

# If there are genes to analyze, proceed with enrichment analysis
if (length(upregulated_in_hallmark) > 0) {
  # Perform enrichment analysis for upregulated genes using Hallmark gene sets
  hallmark_up <- enricher(gene = upregulated_in_hallmark, 
                          TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],  # Ensure TERM2GENE uses correct columns
                          pvalueCutoff = 0.05)
  # Check if results exist
  if (!is.null(hallmark_up) && nrow(hallmark_up) > 0) {
    # Visualize results if available
    up_dotplot <- dotplot(hallmark_up, showCategory = 20, title = "Hallmark Pathway Enrichment for Upregulated Genes")
    
    # Display the plot in the notebook
    print(up_dotplot)
    
    # Save the dotplot to a PNG file
    ggsave(paste0(output_folder, "hallmark_upregulated_dotplot.png"), plot = up_dotplot, width = 10, height = 8)
    
    # Optionally, save the results as CSV
    write.csv(as.data.frame(hallmark_up), file = paste0(output_folder, "hallmark_upregulated_enrichment.csv"), row.names = FALSE)
  } else {
    cat("No significant enrichment found for upregulated genes.\n")
  }
} else {
  cat("No upregulated genes overlap with Hallmark gene sets.\n")
}

if (length(downregulated_in_hallmark) > 0) {
  # Perform enrichment analysis for downregulated genes using Hallmark gene sets
  hallmark_down <- enricher(gene = downregulated_in_hallmark, 
                            TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],  # Ensure TERM2GENE uses correct columns
                            pvalueCutoff = 0.05)
  # Check if results exist
  if (!is.null(hallmark_down) && nrow(hallmark_down) > 0) {
    # Visualize results if available
    down_dotplot <- dotplot(hallmark_down, showCategory = 20, title = "Hallmark Pathway Enrichment for Downregulated Genes")
    
    # Display the plot in the notebook
    print(down_dotplot)
    
    # Save the dotplot to a PNG file
    ggsave(paste0(output_folder, "hallmark_downregulated_dotplot.png"), plot = down_dotplot, width = 10, height = 8)
    
    # Optionally, save the results as CSV
    write.csv(as.data.frame(hallmark_down), file = paste0(output_folder, "hallmark_downregulated_enrichment.csv"), row.names = FALSE)
  } else {
    cat("No significant enrichment found for downregulated genes.\n")
  }
} else {
  cat("No downregulated genes overlap with Hallmark gene sets.\n")
}


```

