1. load libraries

2. Load the filtered list on mean expression


load("/home/bioinfo/2025_NewHarmony_Integrated_Files/0-imp_Robj/0-robj/5-Harmony_Integrated_All_samples_Merged_CD4Tcells_final_Resolution_Selected_0.8_ADT_Normalized_cleaned_mt.robj")

3. Distribution of cells in malignant Clusters


# Load necessary libraries
library(Seurat)
library(ggplot2)
library(viridis)
library(RColorBrewer)

# Step 1: Subset malignant cells (exclude PBMC and PBMC_10x)
malignant_cells <- WhichCells(All_samples_Merged, expression = !(cell_line %in% c("PBMC", "PBMC_10x")))

# Step 2: Extract predicted.celltype.l2 values for malignant cells
malignant_celltypes <- All_samples_Merged@meta.data[malignant_cells, "predicted.celltype.l2", drop = FALSE]

# Step 3: Create a frequency table
celltype_table <- as.data.frame(table(malignant_celltypes$predicted.celltype.l2))
colnames(celltype_table) <- c("cell_type", "nCells")

# Step 4: Generate a soft color palette (Set2 supports up to 8 by default)
n_colors <- length(unique(celltype_table$cell_type))
base_palette <- brewer.pal(min(n_colors, 8), "Set2")

# If >8 colors, extend with "Paired" or loop existing palette
if (n_colors > 8) {
  extra_colors <- brewer.pal(min(n_colors - 8, 12), "Paired")
  color_palette <- c(base_palette, extra_colors)
} else {
  color_palette <- base_palette
}

# Repeat if not enough (fallback)
if (length(color_palette) < n_colors) {
  color_palette <- rep(color_palette, length.out = n_colors)
}

# Step 5: Plot
celltype_barplot <- ggplot(celltype_table, aes(x = reorder(cell_type, -nCells), y = nCells, fill = cell_type)) +
  geom_col() +
  theme_classic() +
  geom_text(aes(label = nCells), vjust = -0.25) +
  scale_fill_manual(values = color_palette) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5),
        legend.position = "none") +
  ggtitle("Cell Type Composition in Malignant Cell Lines") +
  xlab("Predicted Cell Type (Azimuth)") +
  ylab("Number of Cells")

# Step 6: Print the plot
print(celltype_barplot)

4. Distribution of cells in malignant Clusters


library(Seurat)
library(ggplot2)
library(RColorBrewer)

# Step 1: Identify HSPC cells based on Azimuth annotation
cDC2_cells <- WhichCells(All_samples_Merged, expression = predicted.celltype.l2 == "cDC2")

# Step 2: Get cluster assignments of HSPC cells
cDC2_clusters <- All_samples_Merged@meta.data[cDC2_cells, "seurat_clusters", drop = FALSE]

# Step 3: Create a frequency table
cDC2_cluster_counts <- as.data.frame(table(cDC2_clusters$seurat_clusters))
colnames(cDC2_cluster_counts) <- c("Cluster", "ncDC2")

# Step 4: Generate color palette (adjustable if you have many clusters)
n_colors <- length(unique(cDC2_cluster_counts$Cluster))
bar_colors <- brewer.pal(min(n_colors, 8), "Set2")
if (n_colors > length(bar_colors)) {
  bar_colors <- rep(bar_colors, length.out = n_colors)
}

# Step 5: Plot
cDC2_cluster_plot <- ggplot(cDC2_cluster_counts, aes(x = reorder(Cluster, -ncDC2), y = ncDC2, fill = Cluster)) +
  geom_col() +
  geom_text(aes(label = ncDC2), vjust = -0.25) +
  scale_fill_manual(values = bar_colors) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5),
        legend.position = "none") +
  ggtitle("cDC2 Cells per Seurat Cluster") +
  xlab("Seurat Cluster") +
  ylab("Number of cDC2 Cells")

print(cDC2_cluster_plot)

5. Subset the cDC2 Cells (Based on Azimuth Annotation)


# Subset cDC2 cells
cDC2_cells <- subset(All_samples_Merged, subset = predicted.celltype.l2 == "cDC2")

6. Subset Controls


CD4_normal <- subset(
  All_samples_Merged,
  subset = cell_line %in% c("PBMC", "PBMC_10x")
)

7. label them in the full object for DE analysis:


# 1. Set RNA as default assay (if not already)
DefaultAssay(All_samples_Merged) <- "RNA"

# 2. Normalize (only if not already normalized)
All_samples_Merged <- NormalizeData(
  All_samples_Merged,
  normalization.method = "LogNormalize",
  scale.factor = 10000
)
Normalizing layer: counts
Performing log-normalization
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
# 3. Get cell names from subset objects
cDC2_cells_names <- colnames(cDC2_cells)
CD4_normal_cells_names <- colnames(CD4_normal)

# 4. Label groups in the full object
All_samples_Merged$cell_type_group <- NA
All_samples_Merged$cell_type_group[cDC2_cells_names] <- "cDC2"
All_samples_Merged$cell_type_group[CD4_normal_cells_names] <- "Normal_CD4"

# Set identities for DE
Idents(All_samples_Merged) <- "cell_type_group"

# 5. Perform DE analysis
de_cDC2_vs_CD4 <- FindMarkers(
  object = All_samples_Merged,
  ident.1 = "cDC2",
  ident.2 = "Normal_CD4",
  assay = "RNA",
  test.use = "wilcox",
  only.pos = FALSE
)
Warning: The `slot` argument of `GetAssayData()` is deprecated as of SeuratObject 5.0.0.
Please use the `layer` argument instead.Warning: `PackageCheck()` was deprecated in SeuratObject 5.0.0.
Please use `rlang::check_installed()` instead.
# 6. Add mean expression per group
rna_data <- GetAssayData(All_samples_Merged, assay = "RNA", slot = "data")
de_cDC2_vs_CD4$mean_expr_cDC2 <- rowMeans(rna_data[rownames(de_cDC2_vs_CD4), cDC2_cells_names])
de_cDC2_vs_CD4$mean_expr_NormalCD4 <- rowMeans(rna_data[rownames(de_cDC2_vs_CD4), CD4_normal_cells_names])

# 7. Save results
write.csv(de_cDC2_vs_CD4, file = "DE_cDC2_vs_NormalCD4.csv", row.names = TRUE)

8. Filter cDC2 Data Based on Mean Expression


# Apply the filter to the cDC2 DE results to remove genes with mean expression < 0.2 in both groups
cDC2_filtered <- de_cDC2_vs_CD4 %>%
  filter(!(mean_expr_cDC2 < 0.2 & mean_expr_NormalCD4 < 0.2))

# Save the filtered DE results
write.csv(cDC2_filtered, file = "cDC2_vs_NormalCD4_filtered_mean_expression.csv", row.names = TRUE)

9. Volcano Plots


library(dplyr)
library(EnhancedVolcano)

# Sort genes by adjusted p-value and absolute log2 fold change
filtered_genes <- de_cDC2_vs_CD4 %>%
  arrange(p_val_adj, desc(abs(avg_log2FC)))

# Add gene names as a column for labeling
filtered_genes$gene <- rownames(filtered_genes)


# Create the volcano plot for cDC2 vs normal CD4 T cells
EnhancedVolcano(
  filtered_genes,
  lab = ifelse(filtered_genes$p_val_adj <= 1e-6 & abs(filtered_genes$avg_log2FC) >= 1.5,
               filtered_genes$gene, NA),
  x = "avg_log2FC",
  y = "p_val_adj",
  title = "cDC2 vs Normal CD4 T cells",
  pCutoff = 1e-6,
  FCcutoff = 1.0,
  legendPosition = 'right',
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'blue', 'red'),
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0]
)
Warning: One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...

Volcano Plots


filtered_genes <- de_cDC2_vs_CD4 %>%
  arrange(p_val_adj, desc(abs(avg_log2FC)))

# Add gene names as a column for labeling
filtered_genes$gene <- rownames(filtered_genes)

# Fix zero p-values
min_nonzero_pval <- min(filtered_genes$p_val_adj[filtered_genes$p_val_adj > 0])
filtered_genes$p_val_adj[filtered_genes$p_val_adj == 0] <- min_nonzero_pval * 0.1

# Create safe labels
labels <- ifelse(filtered_genes$p_val_adj <= 1e-6 & abs(filtered_genes$avg_log2FC) >= 1.5,
                 filtered_genes$gene, NA)

selected_labels <- filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0]

if (length(selected_labels) == 0) selected_labels <- NA

EnhancedVolcano(
  filtered_genes,
  lab = labels,
  x = "avg_log2FC",
  y = "p_val_adj",
  title = "cDC2 vs Normal CD4 T cells",
  pCutoff = 1e-6,
  FCcutoff = 1.0,
  legendPosition = 'right',
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'blue', 'red'),
  selectLab = selected_labels
)

NA
NA
NA

10. Enrichment Analysis-All_Pathways



# Define the output folder for cDC2 analysis results
output_folder <- "cDC2_250UP_vs_Control_250DOWN/"

# Create the output folder if it doesn't exist
if (!dir.exists(output_folder)) {
  dir.create(output_folder)
}

# Number of genes to select
UP_genes <- 250
Down_genes <- 250

# Thresholds for log fold change
logFC_up_threshold <- 1.5          # Upregulated
logFC_down_threshold <- -1         # Downregulated


cDC2_filtered$gene <- rownames(cDC2_filtered)

# Filter and arrange genes based on thresholds and adjusted p-value
filtered_genes <- cDC2_filtered %>%
  filter(avg_log2FC > logFC_up_threshold | avg_log2FC < logFC_down_threshold) %>%
  arrange(p_val_adj) %>%
  select(gene, everything())  # change 'gene' to your actual gene symbol column name


# Separate up- and downregulated genes
upregulated_genes <- filtered_genes %>% filter(avg_log2FC > logFC_up_threshold)
downregulated_genes <- filtered_genes %>% filter(avg_log2FC < logFC_down_threshold)

# Select top upregulated genes
if (nrow(upregulated_genes) < UP_genes) {
  top_upregulated_genes <- upregulated_genes
  cat("Number of upregulated genes selected:", nrow(top_upregulated_genes), "\n")
  cat("p_val_adj for last upregulated gene:", tail(top_upregulated_genes$p_val_adj, 1), "\n")
} else {
  top_upregulated_genes <- head(upregulated_genes, UP_genes)
  cat("Number of upregulated genes selected:", nrow(top_upregulated_genes), "\n")
  cat("p_val_adj for last upregulated gene:", tail(top_upregulated_genes$p_val_adj, 1), "\n")
}
Number of upregulated genes selected: 250 
p_val_adj for last upregulated gene: 0 
# Select top downregulated genes
if (nrow(downregulated_genes) < Down_genes) {
  top_downregulated_genes <- downregulated_genes
  cat("Number of downregulated genes selected:", nrow(top_downregulated_genes), "\n")
  cat("p_val_adj for last downregulated gene:", tail(top_downregulated_genes$p_val_adj, 1), "\n")
} else {
  top_downregulated_genes <- head(downregulated_genes, Down_genes)
  cat("Number of downregulated genes selected:", nrow(top_downregulated_genes), "\n")
  cat("p_val_adj for last downregulated gene:", tail(top_downregulated_genes$p_val_adj, 1), "\n")
}
Number of downregulated genes selected: 250 
p_val_adj for last downregulated gene: 6.961907e-97 
# Combine top genes for reference
top_genes <- bind_rows(top_upregulated_genes, top_downregulated_genes) %>% na.omit()

# Save gene lists
write.csv(top_upregulated_genes, paste0(output_folder, "upregulated_genes.csv"), row.names = FALSE)
write.csv(top_downregulated_genes, paste0(output_folder, "downregulated_genes.csv"), row.names = FALSE)

# Convert gene symbols to Entrez IDs
upregulated_entrez <- bitr(top_upregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
'select()' returned 1:1 mapping between keys and columns
Warning: 2.8% of input gene IDs are fail to map...
downregulated_entrez <- bitr(top_downregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
'select()' returned 1:1 mapping between keys and columns
Warning: 3.2% of input gene IDs are fail to map...
# Report missing gene symbols
missing_upregulated <- setdiff(top_upregulated_genes$gene, upregulated_entrez$SYMBOL)
missing_downregulated <- setdiff(top_downregulated_genes$gene, downregulated_entrez$SYMBOL)

cat("Missing upregulated genes:\n", missing_upregulated, "\n")
Missing upregulated genes:
 WARS AL590550.1 H2AFY C3orf14 PHB WDR34 PALM2-AKAP2 
cat("Missing downregulated genes:\n", missing_downregulated, "\n")
Missing downregulated genes:
 LINC01578 MT-ND3 HIST1H4C AC119396.1 FAM102A AL138963.4 AC243960.1 AC139720.1 
# Merge Entrez IDs back to gene tables
top_upregulated_genes <- merge(top_upregulated_genes, upregulated_entrez, by.x = "gene", by.y = "SYMBOL", all.x = TRUE)
top_downregulated_genes <- merge(top_downregulated_genes, downregulated_entrez, by.x = "gene", by.y = "SYMBOL", all.x = TRUE)

# Filter genes without Entrez IDs
top_upregulated_genes <- top_upregulated_genes[!is.na(top_upregulated_genes$ENTREZID), ]
top_downregulated_genes <- top_downregulated_genes[!is.na(top_downregulated_genes$ENTREZID), ]

# Extract Entrez IDs for enrichment
upregulated_entrez <- top_upregulated_genes$ENTREZID
downregulated_entrez <- top_downregulated_genes$ENTREZID

# Functions for enrichment analysis and plotting
safe_enrichGO <- function(gene_list, title, filename) {
  if (length(gene_list) > 0) {
    result <- enrichGO(gene = gene_list, OrgDb = org.Hs.eg.db, keyType = "ENTREZID",
                       ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 0.05, readable = TRUE)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)  
      ggsave(paste0(output_folder, gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0(output_folder, filename), row.names = FALSE)
    } else {
      message(paste("No significant GO enrichment for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

safe_enrichKEGG <- function(entrez_list, title, filename) {
  if (length(entrez_list) > 0) {
    result <- enrichKEGG(gene = entrez_list, organism = "hsa", pvalueCutoff = 0.05)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      result <- setReadable(result, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)
      ggsave(paste0(output_folder, gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0(output_folder, filename), row.names = FALSE)
    } else {
      message(paste("No significant KEGG pathways for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

safe_enrichReactome <- function(entrez_list, title, filename) {
  if (length(entrez_list) > 0) {
    result <- enrichPathway(gene = entrez_list, organism = "human", pvalueCutoff = 0.05)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      result <- setReadable(result, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)
      ggsave(paste0(output_folder, gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0(output_folder, filename), row.names = FALSE)
    } else {
      message(paste("No significant Reactome pathways for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

# Run enrichment analyses and save results
safe_enrichGO(top_upregulated_genes$ENTREZID, "GO Enrichment for Upregulated Genes in cDC2", "upregulated_GO_results.csv")

safe_enrichGO(top_downregulated_genes$ENTREZID, "GO Enrichment for Downregulated Genes in cDC2", "downregulated_GO_results.csv")



safe_enrichKEGG(upregulated_entrez, "KEGG Pathway Enrichment for Upregulated Genes in cDC2", "upregulated_KEGG_results.csv")

safe_enrichKEGG(downregulated_entrez, "KEGG Pathway Enrichment for Downregulated Genes in cDC2", "downregulated_KEGG_results.csv")


safe_enrichReactome(upregulated_entrez, "Reactome Pathway Enrichment for Upregulated Genes in cDC2", "upregulated_Reactome_results.csv")

safe_enrichReactome(downregulated_entrez, "Reactome Pathway Enrichment for Downregulated Genes in cDC2", "downregulated_Reactome_results.csv")

NA
NA

Enrichment Analysis_Hallmark


# Load necessary libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(msigdbr)
library(enrichplot)
library(ggplot2)
library(dplyr)

# Define the output folder where the results will be saved
output_folder <- "cDC2_250UP_vs_Control_250DOWN/"

# Create the output folder if it doesn't exist
if (!dir.exists(output_folder)) {
  dir.create(output_folder)
}

# Load Hallmark gene sets from msigdbr
hallmark_sets <- msigdbr(species = "Homo sapiens", collection = "H")  # "H" is for Hallmark gene sets

# Convert gene symbols to uppercase for consistency
top_upregulated_genes$gene <- toupper(top_upregulated_genes$gene)
top_downregulated_genes$gene <- toupper(top_downregulated_genes$gene)

# Check for overlap between your upregulated/downregulated genes and Hallmark gene sets
upregulated_in_hallmark <- intersect(top_upregulated_genes$gene, hallmark_sets$gene_symbol)
downregulated_in_hallmark <- intersect(top_downregulated_genes$gene, hallmark_sets$gene_symbol)

# Print the number of overlapping genes for both upregulated and downregulated genes
cat("Number of upregulated genes in Hallmark gene sets:", length(upregulated_in_hallmark), "\n")
Number of upregulated genes in Hallmark gene sets: 156 
cat("Number of downregulated genes in Hallmark gene sets:", length(downregulated_in_hallmark), "\n")
Number of downregulated genes in Hallmark gene sets: 84 
# If there are genes to analyze, proceed with enrichment analysis
if (length(upregulated_in_hallmark) > 0) {
  # Perform enrichment analysis for upregulated genes using Hallmark gene sets
  hallmark_up <- enricher(gene = upregulated_in_hallmark, 
                          TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],  # Ensure TERM2GENE uses correct columns
                          pvalueCutoff = 0.05)
  # Check if results exist
  if (!is.null(hallmark_up) && nrow(hallmark_up) > 0) {
    # Visualize results if available
    up_dotplot <- dotplot(hallmark_up, showCategory = 20, title = "Hallmark Pathway Enrichment for Upregulated Genes in cDC2")
    
    # Display the plot in the notebook
    print(up_dotplot)
    
    # Save the dotplot to a PNG file
    ggsave(paste0(output_folder, "hallmark_upregulated_dotplot.png"), plot = up_dotplot, width = 10, height = 8)
    
    # Optionally, save the results as CSV
    write.csv(as.data.frame(hallmark_up), file = paste0(output_folder, "hallmark_upregulated_enrichment.csv"), row.names = FALSE)
  } else {
    cat("No significant enrichment found for upregulated genes.\n")
  }
} else {
  cat("No upregulated genes overlap with Hallmark gene sets.\n")
}


if (length(downregulated_in_hallmark) > 0) {
  # Perform enrichment analysis for downregulated genes using Hallmark gene sets
  hallmark_down <- enricher(gene = downregulated_in_hallmark, 
                            TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],  # Ensure TERM2GENE uses correct columns
                            pvalueCutoff = 0.05)
  # Check if results exist
  if (!is.null(hallmark_down) && nrow(hallmark_down) > 0) {
    # Visualize results if available
    down_dotplot <- dotplot(hallmark_down, showCategory = 20, title = "Hallmark Pathway Enrichment for Downregulated Genes in cDC2")
    
    # Display the plot in the notebook
    print(down_dotplot)
    
    # Save the dotplot to a PNG file
    ggsave(paste0(output_folder, "hallmark_downregulated_dotplot.png"), plot = down_dotplot, width = 10, height = 8)
    
    # Optionally, save the results as CSV
    write.csv(as.data.frame(hallmark_down), file = paste0(output_folder, "hallmark_downregulated_enrichment.csv"), row.names = FALSE)
  } else {
    cat("No significant enrichment found for downregulated genes.\n")
  }
} else {
  cat("No downregulated genes overlap with Hallmark gene sets.\n")
}

NA
NA
NA
---
title: "cDC2 vs Control_filtred_on_mean"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  # pdf_document: default
  # word_document: default
  # html_document: default
  #rmdformats::readthedown
  html_notebook:
    toc: true
    toc_float: true
    toc_collapsed: true
---

# 1. load libraries
```{r setup, include=FALSE}
# Load libraries
library(Seurat)


# Load necessary libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(ReactomePA)
library(DOSE) # For GSEA analysis
library(ggplot2) # Ensure ggplot2 is available for plotting
library(dplyr)

```

# 2. Load the filtered list on mean expression
```{r , fig.height=8, fig.width=10}

load("/home/bioinfo/2025_NewHarmony_Integrated_Files/0-imp_Robj/0-robj/5-Harmony_Integrated_All_samples_Merged_CD4Tcells_final_Resolution_Selected_0.8_ADT_Normalized_cleaned_mt.robj")

```

# 3. Distribution of cells in malignant Clusters
```{r , fig.height=6, fig.width=8}

# Load necessary libraries
library(Seurat)
library(ggplot2)
library(viridis)
library(RColorBrewer)

# Step 1: Subset malignant cells (exclude PBMC and PBMC_10x)
malignant_cells <- WhichCells(All_samples_Merged, expression = !(cell_line %in% c("PBMC", "PBMC_10x")))

# Step 2: Extract predicted.celltype.l2 values for malignant cells
malignant_celltypes <- All_samples_Merged@meta.data[malignant_cells, "predicted.celltype.l2", drop = FALSE]

# Step 3: Create a frequency table
celltype_table <- as.data.frame(table(malignant_celltypes$predicted.celltype.l2))
colnames(celltype_table) <- c("cell_type", "nCells")

# Step 4: Generate a soft color palette (Set2 supports up to 8 by default)
n_colors <- length(unique(celltype_table$cell_type))
base_palette <- brewer.pal(min(n_colors, 8), "Set2")

# If >8 colors, extend with "Paired" or loop existing palette
if (n_colors > 8) {
  extra_colors <- brewer.pal(min(n_colors - 8, 12), "Paired")
  color_palette <- c(base_palette, extra_colors)
} else {
  color_palette <- base_palette
}

# Repeat if not enough (fallback)
if (length(color_palette) < n_colors) {
  color_palette <- rep(color_palette, length.out = n_colors)
}

# Step 5: Plot
celltype_barplot <- ggplot(celltype_table, aes(x = reorder(cell_type, -nCells), y = nCells, fill = cell_type)) +
  geom_col() +
  theme_classic() +
  geom_text(aes(label = nCells), vjust = -0.25) +
  scale_fill_manual(values = color_palette) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5),
        legend.position = "none") +
  ggtitle("Cell Type Composition in Malignant Cell Lines") +
  xlab("Predicted Cell Type (Azimuth)") +
  ylab("Number of Cells")

# Step 6: Print the plot
print(celltype_barplot)
```
# 4. Distribution of cells in malignant Clusters
```{r , fig.height=6, fig.width=8}

library(Seurat)
library(ggplot2)
library(RColorBrewer)

# Step 1: Identify HSPC cells based on Azimuth annotation
cDC2_cells <- WhichCells(All_samples_Merged, expression = predicted.celltype.l2 == "cDC2")

# Step 2: Get cluster assignments of HSPC cells
cDC2_clusters <- All_samples_Merged@meta.data[cDC2_cells, "seurat_clusters", drop = FALSE]

# Step 3: Create a frequency table
cDC2_cluster_counts <- as.data.frame(table(cDC2_clusters$seurat_clusters))
colnames(cDC2_cluster_counts) <- c("Cluster", "ncDC2")

# Step 4: Generate color palette (adjustable if you have many clusters)
n_colors <- length(unique(cDC2_cluster_counts$Cluster))
bar_colors <- brewer.pal(min(n_colors, 8), "Set2")
if (n_colors > length(bar_colors)) {
  bar_colors <- rep(bar_colors, length.out = n_colors)
}

# Step 5: Plot
cDC2_cluster_plot <- ggplot(cDC2_cluster_counts, aes(x = reorder(Cluster, -ncDC2), y = ncDC2, fill = Cluster)) +
  geom_col() +
  geom_text(aes(label = ncDC2), vjust = -0.25) +
  scale_fill_manual(values = bar_colors) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5),
        legend.position = "none") +
  ggtitle("cDC2 Cells per Seurat Cluster") +
  xlab("Seurat Cluster") +
  ylab("Number of cDC2 Cells")

print(cDC2_cluster_plot)

```
# 5. Subset the cDC2 Cells (Based on Azimuth Annotation)
```{r , fig.height=8, fig.width=10}

# Subset cDC2 cells
cDC2_cells <- subset(All_samples_Merged, subset = predicted.celltype.l2 == "cDC2")


```


# 6. Subset Controls
```{r , fig.height=8, fig.width=10}

CD4_normal <- subset(
  All_samples_Merged,
  subset = cell_line %in% c("PBMC", "PBMC_10x")
)


```

# 7. label them in the full object for DE analysis:
```{r , fig.height=8, fig.width=10}

# 1. Set RNA as default assay (if not already)
DefaultAssay(All_samples_Merged) <- "RNA"

# 2. Normalize (only if not already normalized)
All_samples_Merged <- NormalizeData(
  All_samples_Merged,
  normalization.method = "LogNormalize",
  scale.factor = 10000
)

# 3. Get cell names from subset objects
cDC2_cells_names <- colnames(cDC2_cells)
CD4_normal_cells_names <- colnames(CD4_normal)

# 4. Label groups in the full object
All_samples_Merged$cell_type_group <- NA
All_samples_Merged$cell_type_group[cDC2_cells_names] <- "cDC2"
All_samples_Merged$cell_type_group[CD4_normal_cells_names] <- "Normal_CD4"

# Set identities for DE
Idents(All_samples_Merged) <- "cell_type_group"

# 5. Perform DE analysis
de_cDC2_vs_CD4 <- FindMarkers(
  object = All_samples_Merged,
  ident.1 = "cDC2",
  ident.2 = "Normal_CD4",
  assay = "RNA",
  test.use = "wilcox",
  only.pos = FALSE
)

# 6. Add mean expression per group
rna_data <- GetAssayData(All_samples_Merged, assay = "RNA", slot = "data")
de_cDC2_vs_CD4$mean_expr_cDC2 <- rowMeans(rna_data[rownames(de_cDC2_vs_CD4), cDC2_cells_names])
de_cDC2_vs_CD4$mean_expr_NormalCD4 <- rowMeans(rna_data[rownames(de_cDC2_vs_CD4), CD4_normal_cells_names])

# 7. Save results
write.csv(de_cDC2_vs_CD4, file = "DE_cDC2_vs_NormalCD4.csv", row.names = TRUE)

```

# 8. Filter cDC2 Data Based on Mean Expression
```{r , fig.height=8, fig.width=10}

# Apply the filter to the cDC2 DE results to remove genes with mean expression < 0.2 in both groups
cDC2_filtered <- de_cDC2_vs_CD4 %>%
  filter(!(mean_expr_cDC2 < 0.2 & mean_expr_NormalCD4 < 0.2))

# Save the filtered DE results
write.csv(cDC2_filtered, file = "cDC2_vs_NormalCD4_filtered_mean_expression.csv", row.names = TRUE)




```


# 9. Volcano Plots
```{r , fig.height=12, fig.width=16}

library(dplyr)
library(EnhancedVolcano)

# Sort genes by adjusted p-value and absolute log2 fold change
filtered_genes <- de_cDC2_vs_CD4 %>%
  arrange(p_val_adj, desc(abs(avg_log2FC)))

# Add gene names as a column for labeling
filtered_genes$gene <- rownames(filtered_genes)


# Create the volcano plot for cDC2 vs normal CD4 T cells
EnhancedVolcano(
  filtered_genes,
  lab = ifelse(filtered_genes$p_val_adj <= 1e-6 & abs(filtered_genes$avg_log2FC) >= 1.5,
               filtered_genes$gene, NA),
  x = "avg_log2FC",
  y = "p_val_adj",
  title = "cDC2 vs Normal CD4 T cells",
  pCutoff = 1e-6,
  FCcutoff = 1.0,
  legendPosition = 'right',
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'blue', 'red'),
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0]
)



```
## Volcano Plots
```{r , fig.height=12, fig.width=16}

filtered_genes <- de_cDC2_vs_CD4 %>%
  arrange(p_val_adj, desc(abs(avg_log2FC)))

# Add gene names as a column for labeling
filtered_genes$gene <- rownames(filtered_genes)

# Fix zero p-values
min_nonzero_pval <- min(filtered_genes$p_val_adj[filtered_genes$p_val_adj > 0])
filtered_genes$p_val_adj[filtered_genes$p_val_adj == 0] <- min_nonzero_pval * 0.1

# Create safe labels
labels <- ifelse(filtered_genes$p_val_adj <= 1e-6 & abs(filtered_genes$avg_log2FC) >= 1.5,
                 filtered_genes$gene, NA)

selected_labels <- filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0]

if (length(selected_labels) == 0) selected_labels <- NA

EnhancedVolcano(
  filtered_genes,
  lab = labels,
  x = "avg_log2FC",
  y = "p_val_adj",
  title = "cDC2 vs Normal CD4 T cells",
  pCutoff = 1e-6,
  FCcutoff = 1.0,
  legendPosition = 'right',
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'blue', 'red'),
  selectLab = selected_labels
)



```

# 10. Enrichment Analysis-All_Pathways
```{r , fig.height=8, fig.width=12}


# Define the output folder for cDC2 analysis results
output_folder <- "cDC2_250UP_vs_Control_250DOWN/"

# Create the output folder if it doesn't exist
if (!dir.exists(output_folder)) {
  dir.create(output_folder)
}

# Number of genes to select
UP_genes <- 250
Down_genes <- 250

# Thresholds for log fold change
logFC_up_threshold <- 1.5          # Upregulated
logFC_down_threshold <- -1         # Downregulated


cDC2_filtered$gene <- rownames(cDC2_filtered)

# Filter and arrange genes based on thresholds and adjusted p-value
filtered_genes <- cDC2_filtered %>%
  filter(avg_log2FC > logFC_up_threshold | avg_log2FC < logFC_down_threshold) %>%
  arrange(p_val_adj) %>%
  select(gene, everything())  # change 'gene' to your actual gene symbol column name


# Separate up- and downregulated genes
upregulated_genes <- filtered_genes %>% filter(avg_log2FC > logFC_up_threshold)
downregulated_genes <- filtered_genes %>% filter(avg_log2FC < logFC_down_threshold)

# Select top upregulated genes
if (nrow(upregulated_genes) < UP_genes) {
  top_upregulated_genes <- upregulated_genes
  cat("Number of upregulated genes selected:", nrow(top_upregulated_genes), "\n")
  cat("p_val_adj for last upregulated gene:", tail(top_upregulated_genes$p_val_adj, 1), "\n")
} else {
  top_upregulated_genes <- head(upregulated_genes, UP_genes)
  cat("Number of upregulated genes selected:", nrow(top_upregulated_genes), "\n")
  cat("p_val_adj for last upregulated gene:", tail(top_upregulated_genes$p_val_adj, 1), "\n")
}

# Select top downregulated genes
if (nrow(downregulated_genes) < Down_genes) {
  top_downregulated_genes <- downregulated_genes
  cat("Number of downregulated genes selected:", nrow(top_downregulated_genes), "\n")
  cat("p_val_adj for last downregulated gene:", tail(top_downregulated_genes$p_val_adj, 1), "\n")
} else {
  top_downregulated_genes <- head(downregulated_genes, Down_genes)
  cat("Number of downregulated genes selected:", nrow(top_downregulated_genes), "\n")
  cat("p_val_adj for last downregulated gene:", tail(top_downregulated_genes$p_val_adj, 1), "\n")
}

# Combine top genes for reference
top_genes <- bind_rows(top_upregulated_genes, top_downregulated_genes) %>% na.omit()

# Save gene lists
write.csv(top_upregulated_genes, paste0(output_folder, "upregulated_genes.csv"), row.names = FALSE)
write.csv(top_downregulated_genes, paste0(output_folder, "downregulated_genes.csv"), row.names = FALSE)

# Convert gene symbols to Entrez IDs
upregulated_entrez <- bitr(top_upregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
downregulated_entrez <- bitr(top_downregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)

# Report missing gene symbols
missing_upregulated <- setdiff(top_upregulated_genes$gene, upregulated_entrez$SYMBOL)
missing_downregulated <- setdiff(top_downregulated_genes$gene, downregulated_entrez$SYMBOL)

cat("Missing upregulated genes:\n", missing_upregulated, "\n")
cat("Missing downregulated genes:\n", missing_downregulated, "\n")

# Merge Entrez IDs back to gene tables
top_upregulated_genes <- merge(top_upregulated_genes, upregulated_entrez, by.x = "gene", by.y = "SYMBOL", all.x = TRUE)
top_downregulated_genes <- merge(top_downregulated_genes, downregulated_entrez, by.x = "gene", by.y = "SYMBOL", all.x = TRUE)

# Filter genes without Entrez IDs
top_upregulated_genes <- top_upregulated_genes[!is.na(top_upregulated_genes$ENTREZID), ]
top_downregulated_genes <- top_downregulated_genes[!is.na(top_downregulated_genes$ENTREZID), ]

# Extract Entrez IDs for enrichment
upregulated_entrez <- top_upregulated_genes$ENTREZID
downregulated_entrez <- top_downregulated_genes$ENTREZID

# Functions for enrichment analysis and plotting
safe_enrichGO <- function(gene_list, title, filename) {
  if (length(gene_list) > 0) {
    result <- enrichGO(gene = gene_list, OrgDb = org.Hs.eg.db, keyType = "ENTREZID",
                       ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 0.05, readable = TRUE)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)  
      ggsave(paste0(output_folder, gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0(output_folder, filename), row.names = FALSE)
    } else {
      message(paste("No significant GO enrichment for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

safe_enrichKEGG <- function(entrez_list, title, filename) {
  if (length(entrez_list) > 0) {
    result <- enrichKEGG(gene = entrez_list, organism = "hsa", pvalueCutoff = 0.05)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      result <- setReadable(result, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)
      ggsave(paste0(output_folder, gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0(output_folder, filename), row.names = FALSE)
    } else {
      message(paste("No significant KEGG pathways for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

safe_enrichReactome <- function(entrez_list, title, filename) {
  if (length(entrez_list) > 0) {
    result <- enrichPathway(gene = entrez_list, organism = "human", pvalueCutoff = 0.05)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      result <- setReadable(result, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)
      ggsave(paste0(output_folder, gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0(output_folder, filename), row.names = FALSE)
    } else {
      message(paste("No significant Reactome pathways for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

# Run enrichment analyses and save results
safe_enrichGO(top_upregulated_genes$ENTREZID, "GO Enrichment for Upregulated Genes in cDC2", "upregulated_GO_results.csv")
safe_enrichGO(top_downregulated_genes$ENTREZID, "GO Enrichment for Downregulated Genes in cDC2", "downregulated_GO_results.csv")


safe_enrichKEGG(upregulated_entrez, "KEGG Pathway Enrichment for Upregulated Genes in cDC2", "upregulated_KEGG_results.csv")
safe_enrichKEGG(downregulated_entrez, "KEGG Pathway Enrichment for Downregulated Genes in cDC2", "downregulated_KEGG_results.csv")

safe_enrichReactome(upregulated_entrez, "Reactome Pathway Enrichment for Upregulated Genes in cDC2", "upregulated_Reactome_results.csv")
safe_enrichReactome(downregulated_entrez, "Reactome Pathway Enrichment for Downregulated Genes in cDC2", "downregulated_Reactome_results.csv")


```

## Enrichment Analysis_Hallmark
```{r , fig.height=8, fig.width=12}

# Load necessary libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(msigdbr)
library(enrichplot)
library(ggplot2)
library(dplyr)

# Define the output folder where the results will be saved
output_folder <- "cDC2_250UP_vs_Control_250DOWN/"

# Create the output folder if it doesn't exist
if (!dir.exists(output_folder)) {
  dir.create(output_folder)
}

# Load Hallmark gene sets from msigdbr
hallmark_sets <- msigdbr(species = "Homo sapiens", collection = "H")  # "H" is for Hallmark gene sets

# Convert gene symbols to uppercase for consistency
top_upregulated_genes$gene <- toupper(top_upregulated_genes$gene)
top_downregulated_genes$gene <- toupper(top_downregulated_genes$gene)

# Check for overlap between your upregulated/downregulated genes and Hallmark gene sets
upregulated_in_hallmark <- intersect(top_upregulated_genes$gene, hallmark_sets$gene_symbol)
downregulated_in_hallmark <- intersect(top_downregulated_genes$gene, hallmark_sets$gene_symbol)

# Print the number of overlapping genes for both upregulated and downregulated genes
cat("Number of upregulated genes in Hallmark gene sets:", length(upregulated_in_hallmark), "\n")
cat("Number of downregulated genes in Hallmark gene sets:", length(downregulated_in_hallmark), "\n")

# If there are genes to analyze, proceed with enrichment analysis
if (length(upregulated_in_hallmark) > 0) {
  # Perform enrichment analysis for upregulated genes using Hallmark gene sets
  hallmark_up <- enricher(gene = upregulated_in_hallmark, 
                          TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],  # Ensure TERM2GENE uses correct columns
                          pvalueCutoff = 0.05)
  # Check if results exist
  if (!is.null(hallmark_up) && nrow(hallmark_up) > 0) {
    # Visualize results if available
    up_dotplot <- dotplot(hallmark_up, showCategory = 20, title = "Hallmark Pathway Enrichment for Upregulated Genes in cDC2")
    
    # Display the plot in the notebook
    print(up_dotplot)
    
    # Save the dotplot to a PNG file
    ggsave(paste0(output_folder, "hallmark_upregulated_dotplot.png"), plot = up_dotplot, width = 10, height = 8)
    
    # Optionally, save the results as CSV
    write.csv(as.data.frame(hallmark_up), file = paste0(output_folder, "hallmark_upregulated_enrichment.csv"), row.names = FALSE)
  } else {
    cat("No significant enrichment found for upregulated genes.\n")
  }
} else {
  cat("No upregulated genes overlap with Hallmark gene sets.\n")
}

if (length(downregulated_in_hallmark) > 0) {
  # Perform enrichment analysis for downregulated genes using Hallmark gene sets
  hallmark_down <- enricher(gene = downregulated_in_hallmark, 
                            TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],  # Ensure TERM2GENE uses correct columns
                            pvalueCutoff = 0.05)
  # Check if results exist
  if (!is.null(hallmark_down) && nrow(hallmark_down) > 0) {
    # Visualize results if available
    down_dotplot <- dotplot(hallmark_down, showCategory = 20, title = "Hallmark Pathway Enrichment for Downregulated Genes in cDC2")
    
    # Display the plot in the notebook
    print(down_dotplot)
    
    # Save the dotplot to a PNG file
    ggsave(paste0(output_folder, "hallmark_downregulated_dotplot.png"), plot = down_dotplot, width = 10, height = 8)
    
    # Optionally, save the results as CSV
    write.csv(as.data.frame(hallmark_down), file = paste0(output_folder, "hallmark_downregulated_enrichment.csv"), row.names = FALSE)
  } else {
    cat("No significant enrichment found for downregulated genes.\n")
  }
} else {
  cat("No downregulated genes overlap with Hallmark gene sets.\n")
}



```



