1. Load Libraries

2. load seurat object

#Load Seurat Object L7
load("../0-robj/5-Harmony_Integrated_All_samples_Merged_CD4Tcells_final_Resolution_Selected_0.8_ADT_Normalized_cleaned_mt.robj")


All_samples_Merged

3. Set Up Identifiers for Clustering

# Assign cluster identities to the Seurat object
Idents(All_samples_Merged) <- "seurat_clusters"

DimPlot(All_samples_Merged, reduction = "umap", group.by = "cell_line",label = T, label.box = T)

DimPlot(All_samples_Merged, reduction = "umap", group.by = "seurat_clusters",label = T, label.box = T)

4. Differential Expression Analysis

4.1 Using MAST with SCT and Latent Variables (Batch Correction)

markers_mast_SCT <- FindMarkers(
  All_samples_Merged,
  ident.1 = c("5", "9", "1", "2", "6", "8", "4", "0", "7", "11", "12", "13"), # Cell lines-Malignant
  ident.2 = c("3", "10"), # Normal CD4 T cells
  assay = "SCT",
  test.use = "MAST",
  latent.vars = c("cell_line", "Patient_origin"), # Adjust for batch effects
  min.pct = 0,
  logfc.threshold = 0
)
write.csv(markers_mast_SCT, file = "../0-robj/1-MAST_with_SCT_batch_patient_cellline_as_Covariate.csv", row.names = TRUE)

4.2 Using MAST with RNA and Latent Variables (Batch Correction)

markers_mast_RNA <- FindMarkers(
  All_samples_Merged,
 ident.1 = cc("5", "9", "1", "2", "6", "8", "4", "0", "7", "11", "12", "13"), # Cell lines-Malignant
  ident.2 = c("3", "10"), # Normal CD4 T cells
  assay = "RNA",
  test.use = "MAST",
  latent.vars = c("cell_line", "Patient_origin"), # Adjust for batch effects
  min.pct = 0,
  logfc.threshold = 0
)
Erreur dans cc("5", "9", "1", "2", "6", "8", "4", "0", "7", "11", "12", "13") : 
  impossible de trouver la fonction "cc"

5. Compute Mean Expression for Groups

library(Seurat)
library(dplyr)

# Define cell groups
group1_cells <- WhichCells(All_samples_Merged, idents = c("5", "9", "1", "2", "6", "8", "4", "0", "7", "11", "12", "13"))  # Malignant CD4+ T cells
group2_cells <- WhichCells(All_samples_Merged, idents = c("3", "10"))  # Normal CD4+ T cells

# Extract normalized expression values for both assays
expression_data_RNA <- GetAssayData(All_samples_Merged, assay = "RNA", layer = "data")  # Log-normalized counts
expression_data_SCT <- GetAssayData(All_samples_Merged, assay = "SCT", layer = "data")  # SCT-normalized counts (not Pearson residuals)

# Function to calculate mean expression for each group
calculate_mean_expression <- function(markers, group1_cells, group2_cells, expression_data) {
  group1_mean <- rowMeans(expression_data[, group1_cells, drop = FALSE], na.rm = TRUE)
  group2_mean <- rowMeans(expression_data[, group2_cells, drop = FALSE], na.rm = TRUE)
  
  markers <- markers %>%
    rownames_to_column("gene") %>%
    mutate(mean_expr_group1 = group1_mean[gene],
           mean_expr_group2 = group2_mean[gene])
  
  return(markers)
}

# Compute mean expression for RNA and SCT assays
markers_mast_RNA <- calculate_mean_expression(markers_mast_RNA, group1_cells, group2_cells, expression_data_RNA)
markers_mast_SCT <- calculate_mean_expression(markers_mast_SCT, group1_cells, group2_cells, expression_data_SCT)

# Save updated results with mean expression
write.csv(markers_mast_RNA, file = "../0-robj/2-MAST_with_RNA_batch_patient_cellline_as_Covariate_with_meanExpression.csv", row.names = TRUE)
write.csv(markers_mast_SCT, file = "../0-robj/1-MAST_with_SCT_batch_patient_cellline_as_Covariate_with_meanExpression.csv", row.names = TRUE)

6. Summarize Results

6.1 Summary Function

summarize_markers <- function(markers) {
  num_pval0 <- sum(markers$p_val_adj == 0)
  num_pval1 <- sum(markers$p_val_adj == 1)
  num_significant <- sum(markers$p_val_adj < 0.05)
  num_upregulated <- sum(markers$avg_log2FC > 1)
  num_downregulated <- sum(markers$avg_log2FC < -1)
  
  cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
  cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
  cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
  cat("Number of upregulated genes (avg_log2FC > 1):", num_upregulated, "\n")
  cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
}

6.2 Summarize Markers1 (MAST with SCT Assay)

cat("Markers1 Summary (MAST with SCT Assay):\n")
Markers1 Summary (MAST with SCT Assay):
summarize_markers(markers_mast_SCT)
Number of genes with p_val_adj = 0: 74 
Number of genes with p_val_adj = 1: 19337 
Number of significant genes (p_val_adj < 0.05): 5615 
Number of upregulated genes (avg_log2FC > 1): 15048 
Number of downregulated genes (avg_log2FC < -1): 2281 

6.3 Summary for Markers2 (MAST with RNA Assay)

cat("Markers2 Summary (MAST with RNA Assay):\n")
Markers2 Summary (MAST with RNA Assay):
summarize_markers(markers_mast_RNA)
Number of genes with p_val_adj = 0: 83 
Number of genes with p_val_adj = 1: 26400 
Number of significant genes (p_val_adj < 0.05): 9221 
Number of upregulated genes (avg_log2FC > 1): 7334 
Number of downregulated genes (avg_log2FC < -1): 15426 

Compare Results Between SCT and RNA Assays

### Compare Results Between RNA and SCT Assays

# Load gene sets
genes_mast_sct <- markers_mast_SCT$gene
genes_mast_rna <- markers_mast_RNA$gene
common_genes_mast <- intersect(genes_mast_sct, genes_mast_rna)

cat("MAST: Common Genes Between SCT and RNA:", length(common_genes_mast), "\n")
MAST: Common Genes Between SCT and RNA: 26176 

7. Visualization

7.1 Volcano Plot for MAST-SCT with Batch Correction


EnhancedVolcano(markers_mast_SCT,
                lab = markers_mast_SCT$gene,
                x = "avg_log2FC",
                y = "p_val_adj",
                title = "Malignant_vs_Normal_MAST_SCT",
                pCutoff = 0.05,
                FCcutoff = 1.0)
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...

7.2 Volcano Plot MAST-RNA with Batch Correction


EnhancedVolcano(markers_mast_RNA,
                lab = markers_mast_RNA$gene,
                x = "avg_log2FC",
                y = "p_val_adj",
                title = "Malignant_vs_Normal_MAST_RNA",
                pCutoff = 0.05,
                FCcutoff = 1.0)
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...

7.3 Volcano Plot MAST-RNA with Batch Correction


### 1. Load Data and Prepare for Comparison

library(ggplot2)

df_sct <- read.csv("../0-robj/1-MAST_with_SCT_batch_patient_cellline_as_Covariate.csv", row.names = 1)
df_rna <- read.csv("../0-robj/2-MAST_with_RNA_batch_patient_cellline_as_Covariate.csv", row.names = 1)

df_sct$assay <- "SCT"
df_rna$assay <- "RNA"

df_combined <- rbind(df_sct, df_rna)

### 2. Volcano Plot for SCT vs RNA Comparison

ggplot(df_combined, aes(x = avg_log2FC, y = -log10(p_val_adj), color = assay)) +
  geom_point(alpha = 0.7) +
  theme_minimal() +
  labs(title = "Volcano Plot: RNA vs SCT", x = "Log2 Fold Change", y = "-Log10 Adjusted P-value") +
  scale_color_manual(values = c("SCT" = "blue", "RNA" = "red"))

7.4 Correlation of Log2 Fold Change Between RNA and SCT


# Correlation of Log2 Fold Change Between RNA and SCT
# Goal: See if genes have consistent fold changes between RNA and SCT normalization.


library(dplyr)  # Load dplyr to use rownames_to_column

# Convert row names to a "gene" column
df_sct <- df_sct %>% rownames_to_column(var = "gene")
df_rna <- df_rna %>% rownames_to_column(var = "gene")

# Merge datasets by gene
df_merged <- merge(df_sct, df_rna, by = "gene", suffixes = c("_SCT", "_RNA"))

# Scatter plot of log2FC values
ggplot(df_merged, aes(x = avg_log2FC_SCT, y = avg_log2FC_RNA)) +
  geom_point(alpha = 0.7, color = "blue") +
  geom_smooth(method = "lm", color = "red", linetype = "dashed") +
  theme_minimal() +
  labs(title = "Log2 Fold Change Comparison: SCT vs RNA",
       x = "SCT Log2 Fold Change",
       y = "RNA Log2 Fold Change") +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "black")

7.5 Find overlap between the top 100 most significant genes:


cor(df_merged$avg_log2FC_SCT, df_merged$avg_log2FC_RNA, use = "complete.obs")
[1] 0.9815925
# Higher correlation (~0.8-1.0) suggests consistency between methods.
# Lower correlation (<0.6) may indicate different normalization effects.


library(ggplot2)
library(dplyr)

# Define a broader list of known Sezary genes
sezary_genes <- c("KIR3DL2", "TOX", "TWIST1", "CD52", "CCR4", "IL2RA", "CD7", "DPP4", "CCR7", "GATA3", "FOXP3", "CD8A", "CD4", "CYP27B1", "TET2")

# Filter the merged data to include only the known Sezary genes
df_filtered <- df_merged %>%
  filter(gene %in% sezary_genes)

# Volcano plot showing only the known Sezary genes with gene names
ggplot(df_filtered, aes(x = avg_log2FC_SCT, y = -log10(p_val_adj_SCT), color = gene)) +
  geom_point(alpha = 0.7) +
  geom_text(aes(label = gene), vjust = 1.5, hjust = 0.5, size = 3, color = "black") +  # Adding gene names
  theme_minimal() +
  labs(title = "Volcano Plot: Known Sezary Genes (SCT Assay)", 
       x = "Log2 Fold Change (SCT)", 
       y = "-Log10 Adjusted P-value") +
  scale_color_manual(values = c("KIR3DL2" = "#1f77b4",  # blue
                                "TOX" = "#ff7f0e",  # orange
                                "TWIST1" = "#2ca02c",  # green
                                "CD52" = "#9467bd",  # purple
                                "CCR4" = "#8c564b",  # brown
                                "IL2RA" = "#e377c2",  # pink
                                "CD7" = "#7f7f7f",  # gray
                                "DPP4" = "#bcbd22",  # yellow-green
                                "CCR7" = "#17becf",  # cyan
                                "GATA3" = "#d62728",  # red
                                "FOXP3" = "#8a2be2",  # blue-violet
                                "CD8A" = "#ff6347",  # tomato
                                "CD4" = "#3cb371",  # medium sea green
                                "CYP27B1" = "#ff1493",  # deep pink
                                "TET2" = "#00bfff")) +  # deep sky blue
  theme(legend.position = "top")

---
title: "MAST on SCT and RNA assay and its comparison to see which is better"
author: "Nasir Mahmood Abbasi"
date: "`r Sys.Date()`"
output:
  html_notebook:
    toc: yes
    toc_float: yes
    toc_collapsed: yes
  pdf_document:
    toc: yes
---

# 1. Load Libraries
```{r setup, include=FALSE}
library(Seurat)
library(harmony)
library(dplyr)
library(EnhancedVolcano)
library(ggplot2)
library(pheatmap)
library(tibble)
```


# 2. load seurat object
```{r load_seurat}
#Load Seurat Object L7
load("../0-robj/5-Harmony_Integrated_All_samples_Merged_CD4Tcells_final_Resolution_Selected_0.8_ADT_Normalized_cleaned_mt.robj")


All_samples_Merged



```

# 3. Set Up Identifiers for Clustering
```{r}
# Assign cluster identities to the Seurat object
Idents(All_samples_Merged) <- "seurat_clusters"

DimPlot(All_samples_Merged, reduction = "umap", group.by = "cell_line",label = T, label.box = T)
DimPlot(All_samples_Merged, reduction = "umap", group.by = "seurat_clusters",label = T, label.box = T)

```

# 4.  Differential Expression Analysis

## 4.1 Using MAST with SCT and Latent Variables (Batch Correction)
```{r}
markers_mast_SCT <- FindMarkers(
  All_samples_Merged,
  ident.1 = c("5", "9", "1", "2", "6", "8", "4", "0", "7", "11", "12", "13"), # Cell lines-Malignant
  ident.2 = c("3", "10"), # Normal CD4 T cells
  assay = "SCT",
  test.use = "MAST",
  latent.vars = c("cell_line", "Patient_origin"), # Adjust for batch effects
  min.pct = 0,
  logfc.threshold = 0
)
write.csv(markers_mast_SCT, file = "../0-robj/1-MAST_with_SCT_batch_patient_cellline_as_Covariate.csv", row.names = TRUE)
```

## 4.2 Using MAST with RNA and Latent Variables (Batch Correction)
```{r}

library(Seurat)
library(magrittr)

# Set the default assay to RNA
DefaultAssay(All_samples_Merged) <- "RNA"

# Normalize the RNA assay but DO NOT scale the data
All_samples_Merged <- All_samples_Merged %>%
  NormalizeData(normalization.method = "LogNormalize", scale.factor = 10000)


markers_mast_RNA <- FindMarkers(
  All_samples_Merged,
 ident.1 = c("5", "9", "1", "2", "6", "8", "4", "0", "7", "11", "12", "13"), # Cell lines-Malignant
  ident.2 = c("3", "10"), # Normal CD4 T cells
  assay = "RNA",
  test.use = "MAST",
  latent.vars = c("cell_line", "Patient_origin"), # Adjust for batch effects
  min.pct = 0,
  logfc.threshold = 0
)
write.csv(markers_mast_RNA, file = "../0-robj/2-MAST_with_RNA_batch_patient_cellline_as_Covariate.csv", row.names = TRUE)
```



# 5. Compute Mean Expression for Groups
```{r}
library(Seurat)
library(dplyr)

# Define cell groups
group1_cells <- WhichCells(All_samples_Merged, idents = c("5", "9", "1", "2", "6", "8", "4", "0", "7", "11", "12", "13"))  # Malignant CD4+ T cells
group2_cells <- WhichCells(All_samples_Merged, idents = c("3", "10"))  # Normal CD4+ T cells

# Extract normalized expression values for both assays
expression_data_RNA <- GetAssayData(All_samples_Merged, assay = "RNA", layer = "data")  # Log-normalized counts
expression_data_SCT <- GetAssayData(All_samples_Merged, assay = "SCT", layer = "data")  # SCT-normalized counts (not Pearson residuals)

# Function to calculate mean expression for each group
calculate_mean_expression <- function(markers, group1_cells, group2_cells, expression_data) {
  group1_mean <- rowMeans(expression_data[, group1_cells, drop = FALSE], na.rm = TRUE)
  group2_mean <- rowMeans(expression_data[, group2_cells, drop = FALSE], na.rm = TRUE)
  
  markers <- markers %>%
    rownames_to_column("gene") %>%
    mutate(mean_expr_group1 = group1_mean[gene],
           mean_expr_group2 = group2_mean[gene])
  
  return(markers)
}

# Compute mean expression for RNA and SCT assays
markers_mast_RNA <- calculate_mean_expression(markers_mast_RNA, group1_cells, group2_cells, expression_data_RNA)
markers_mast_SCT <- calculate_mean_expression(markers_mast_SCT, group1_cells, group2_cells, expression_data_SCT)

# Save updated results with mean expression
write.csv(markers_mast_RNA, file = "../0-robj/2-MAST_with_RNA_batch_patient_cellline_as_Covariate_with_meanExpression.csv", row.names = TRUE)
write.csv(markers_mast_SCT, file = "../0-robj/1-MAST_with_SCT_batch_patient_cellline_as_Covariate_with_meanExpression.csv", row.names = TRUE)

```

# 6. Summarize Results

## 6.1 Summary Function
```{r}
summarize_markers <- function(markers) {
  num_pval0 <- sum(markers$p_val_adj == 0)
  num_pval1 <- sum(markers$p_val_adj == 1)
  num_significant <- sum(markers$p_val_adj < 0.05)
  num_upregulated <- sum(markers$avg_log2FC > 1)
  num_downregulated <- sum(markers$avg_log2FC < -1)
  
  cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
  cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
  cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
  cat("Number of upregulated genes (avg_log2FC > 1):", num_upregulated, "\n")
  cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
}

```

## 6.2 Summarize Markers1 (MAST with SCT Assay)
```{r}
cat("Markers1 Summary (MAST with SCT Assay):\n")
summarize_markers(markers_mast_SCT)

```

## 6.3 Summary for Markers2 (MAST with RNA Assay)
```{r}
cat("Markers2 Summary (MAST with RNA Assay):\n")
summarize_markers(markers_mast_RNA)
```

## Compare Results Between SCT and RNA Assays
```{r}
### Compare Results Between RNA and SCT Assays

# Load gene sets
genes_mast_sct <- markers_mast_SCT$gene
genes_mast_rna <- markers_mast_RNA$gene
common_genes_mast <- intersect(genes_mast_sct, genes_mast_rna)

cat("MAST: Common Genes Between SCT and RNA:", length(common_genes_mast), "\n")

```


# 7. Visualization

## 7.1 Volcano Plot for MAST-SCT with Batch Correction
```{r, fig.height=12, fig.width=12}

EnhancedVolcano(markers_mast_SCT,
                lab = markers_mast_SCT$gene,
                x = "avg_log2FC",
                y = "p_val_adj",
                title = "Malignant_vs_Normal_MAST_SCT",
                pCutoff = 0.05,
                FCcutoff = 1.0)

```

## 7.2 Volcano Plot MAST-RNA with Batch Correction
```{r, fig.height=12, fig.width=12}

EnhancedVolcano(markers_mast_RNA,
                lab = markers_mast_RNA$gene,
                x = "avg_log2FC",
                y = "p_val_adj",
                title = "Malignant_vs_Normal_MAST_RNA",
                pCutoff = 0.05,
                FCcutoff = 1.0)

```

## 7.3 Volcano Plot MAST-RNA with Batch Correction
```{r, fig.height=12, fig.width=12}

### 1. Load Data and Prepare for Comparison

library(ggplot2)

df_sct <- read.csv("../0-robj/1-MAST_with_SCT_batch_patient_cellline_as_Covariate.csv", row.names = 1)
df_rna <- read.csv("../0-robj/2-MAST_with_RNA_batch_patient_cellline_as_Covariate.csv", row.names = 1)

df_sct$assay <- "SCT"
df_rna$assay <- "RNA"

df_combined <- rbind(df_sct, df_rna)

### 2. Volcano Plot for SCT vs RNA Comparison

ggplot(df_combined, aes(x = avg_log2FC, y = -log10(p_val_adj), color = assay)) +
  geom_point(alpha = 0.7) +
  theme_minimal() +
  labs(title = "Volcano Plot: RNA vs SCT", x = "Log2 Fold Change", y = "-Log10 Adjusted P-value") +
  scale_color_manual(values = c("SCT" = "blue", "RNA" = "red"))

```


## 7.4 Correlation of Log2 Fold Change Between RNA and SCT
```{r, fig.height=12, fig.width=12}

# Correlation of Log2 Fold Change Between RNA and SCT
# Goal: See if genes have consistent fold changes between RNA and SCT normalization.


library(dplyr)  # Load dplyr to use rownames_to_column


# Convert row names to a "gene" column
df_sct <- df_sct %>% rownames_to_column(var = "gene")
df_rna <- df_rna %>% rownames_to_column(var = "gene")

# Merge datasets by gene
df_merged <- merge(df_sct, df_rna, by = "gene", suffixes = c("_SCT", "_RNA"))

# Scatter plot of log2FC values
ggplot(df_merged, aes(x = avg_log2FC_SCT, y = avg_log2FC_RNA)) +
  geom_point(alpha = 0.7, color = "blue") +
  geom_smooth(method = "lm", color = "red", linetype = "dashed") +
  theme_minimal() +
  labs(title = "Log2 Fold Change Comparison: SCT vs RNA",
       x = "SCT Log2 Fold Change",
       y = "RNA Log2 Fold Change") +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "black")
```


## 7.5 Find overlap between the top 100 most significant genes:
```{r, fig.height=12, fig.width=12}

cor(df_merged$avg_log2FC_SCT, df_merged$avg_log2FC_RNA, use = "complete.obs")

# Higher correlation (~0.8-1.0) suggests consistency between methods.
# Lower correlation (<0.6) may indicate different normalization effects.


library(ggplot2)
library(dplyr)

# Define a broader list of known Sezary genes
sezary_genes <- c("KIR3DL2", "TOX", "TWIST1", "CD52", "CCR4", "IL2RA", "CD7", "DPP4", "CCR7", "GATA3", "FOXP3", "CD8A", "CD4", "CYP27B1", "TET2")

# Filter the merged data to include only the known Sezary genes
df_filtered <- df_merged %>%
  filter(gene %in% sezary_genes)

# Volcano plot showing only the known Sezary genes with gene names
ggplot(df_filtered, aes(x = avg_log2FC_SCT, y = -log10(p_val_adj_SCT), color = gene)) +
  geom_point(alpha = 0.7) +
  geom_text(aes(label = gene), vjust = 1.5, hjust = 0.5, size = 3, color = "black") +  # Adding gene names
  theme_minimal() +
  labs(title = "Volcano Plot: Known Sezary Genes (SCT Assay)", 
       x = "Log2 Fold Change (SCT)", 
       y = "-Log10 Adjusted P-value") +
  scale_color_manual(values = c("KIR3DL2" = "#1f77b4",  # blue
                                "TOX" = "#ff7f0e",  # orange
                                "TWIST1" = "#2ca02c",  # green
                                "CD52" = "#9467bd",  # purple
                                "CCR4" = "#8c564b",  # brown
                                "IL2RA" = "#e377c2",  # pink
                                "CD7" = "#7f7f7f",  # gray
                                "DPP4" = "#bcbd22",  # yellow-green
                                "CCR7" = "#17becf",  # cyan
                                "GATA3" = "#d62728",  # red
                                "FOXP3" = "#8a2be2",  # blue-violet
                                "CD8A" = "#ff6347",  # tomato
                                "CD4" = "#3cb371",  # medium sea green
                                "CYP27B1" = "#ff1493",  # deep pink
                                "TET2" = "#00bfff")) +  # deep sky blue
  theme(legend.position = "top")

```