#Load Seurat Object L7
load("../0-robj/5-Harmony_Integrated_All_samples_Merged_CD4Tcells_final_Resolution_Selected_0.8_ADT_Normalized_cleaned_mt.robj")
All_samples_Merged
An object of class Seurat
62900 features across 49305 samples within 6 assays
Active assay: SCT (26176 features, 3000 variable features)
3 layers present: counts, data, scale.data
5 other assays present: RNA, ADT, prediction.score.celltype.l1, prediction.score.celltype.l2, prediction.score.celltype.l3
5 dimensional reductions calculated: integrated_dr, ref.umap, pca, umap, harmony
# Assign cluster identities to the Seurat object
Idents(All_samples_Merged) <- "seurat_clusters"
DimPlot(All_samples_Merged, reduction = "umap", group.by = "cell_line",label = T, label.box = T)
DimPlot(All_samples_Merged, reduction = "umap", group.by = "seurat_clusters",label = T, label.box = T)
# Set the default assay to SCT
DefaultAssay(All_samples_Merged) <- "SCT"
markers_mast_SCT <- FindMarkers(
All_samples_Merged,
ident.1 = c("5", "9", "1", "2", "6", "8", "4", "0", "7", "11", "12", "13"), # Cell lines-Malignant
ident.2 = c("3", "10"), # Normal CD4 T cells
assay = "SCT",
test.use = "MAST",
latent.vars = c("cell_line") # Adjust for batch effects
)
Warning in asMethod(object): sparse->dense coercion: allocating vector of size
5.3 GiB
Warning in new_with_repaired_slots(classname = method, design = colData(sca), : Dropping illegal slot(s) norm.method for class BayesGLMlike.
This likely indicates a bug in an upstream package.
Done!
Combining coefficients and standard errors
Calculating log-fold changes
Calculating likelihood ratio tests
Refitting on reduced model...
Done!
write.csv(markers_mast_SCT, file = "../0-robj/RNA_SCT_test_default_pct_fc_parameters/1-MAST_with_SCT_batch_cellline_as_Covariate.csv", row.names = TRUE)
library(magrittr)
Attaching package: 'magrittr'
The following object is masked from 'package:GenomicRanges':
subtract
# Set the default assay to RNA
DefaultAssay(All_samples_Merged) <- "RNA"
# Normalize the RNA assay but DO NOT scale the data
All_samples_Merged <- All_samples_Merged %>%
NormalizeData(normalization.method = "LogNormalize", scale.factor = 10000)
Normalizing layer: counts
markers_mast_RNA <- FindMarkers(
All_samples_Merged,
ident.1 = c("5", "9", "1", "2", "6", "8", "4", "0", "7", "11", "12", "13"), # Cell lines-Malignant
ident.2 = c("3", "10"), # Normal CD4 T cells
assay = "RNA",
test.use = "MAST",
latent.vars = c("cell_line") # Adjust for batch effects
)
Warning in asMethod(object): sparse->dense coercion: allocating vector of size
5.1 GiB
Done!
Combining coefficients and standard errors
Calculating log-fold changes
Calculating likelihood ratio tests
Refitting on reduced model...
Done!
write.csv(markers_mast_RNA, file = "../0-robj/RNA_SCT_test_default_pct_fc_parameters/2-MAST_with_RNA_batch_cellline_as_Covariate.csv", row.names = TRUE)
library(Seurat)
library(dplyr)
# Define cell groups
group1_cells <- WhichCells(All_samples_Merged, idents = c("5", "9", "1", "2", "6", "8", "4", "0", "7", "11", "12", "13")) # Malignant CD4+ T cells
group2_cells <- WhichCells(All_samples_Merged, idents = c("3", "10")) # Normal CD4+ T cells
# Extract normalized expression values for both assays
expression_data_RNA <- GetAssayData(All_samples_Merged, assay = "RNA", layer = "data") # Log-normalized counts
expression_data_SCT <- GetAssayData(All_samples_Merged, assay = "SCT", layer = "data") # SCT-normalized counts (not Pearson residuals)
# Function to calculate mean expression for each group
calculate_mean_expression <- function(markers, group1_cells, group2_cells, expression_data) {
group1_mean <- rowMeans(expression_data[, group1_cells, drop = FALSE], na.rm = TRUE)
group2_mean <- rowMeans(expression_data[, group2_cells, drop = FALSE], na.rm = TRUE)
markers <- markers %>%
rownames_to_column("gene") %>%
mutate(mean_expr_group1 = group1_mean[gene],
mean_expr_group2 = group2_mean[gene])
return(markers)
}
# Compute mean expression for RNA and SCT assays
markers_mast_RNA <- calculate_mean_expression(markers_mast_RNA, group1_cells, group2_cells, expression_data_RNA)
markers_mast_SCT <- calculate_mean_expression(markers_mast_SCT, group1_cells, group2_cells, expression_data_SCT)
# Save updated results with mean expression
write.csv(markers_mast_SCT, file = "../0-robj/RNA_SCT_test_default_pct_fc_parameters/1-MAST_with_SCT_batch_cellline_as_Covariate_with_meanExpression.csv", row.names = TRUE)
write.csv(markers_mast_RNA, file = "../0-robj/RNA_SCT_test_default_pct_fc_parameters/2-MAST_with_RNA_batch_cellline_as_Covariate_with_meanExpression.csv", row.names = TRUE)
summarize_markers <- function(markers) {
num_pval0 <- sum(markers$p_val_adj == 0)
num_pval1 <- sum(markers$p_val_adj == 1)
num_significant <- sum(markers$p_val_adj < 0.05)
num_upregulated <- sum(markers$avg_log2FC > 1)
num_downregulated <- sum(markers$avg_log2FC < -1)
cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
cat("Number of upregulated genes (avg_log2FC > 1):", num_upregulated, "\n")
cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
}
cat("Markers1 Summary (MAST with SCT Assay):\n")
Markers1 Summary (MAST with SCT Assay):
summarize_markers(markers_mast_SCT)
Number of genes with p_val_adj = 0: 72
Number of genes with p_val_adj = 1: 8299
Number of significant genes (p_val_adj < 0.05): 5084
Number of upregulated genes (avg_log2FC > 1): 9917
Number of downregulated genes (avg_log2FC < -1): 789
cat("Markers2 Summary (MAST with RNA Assay):\n")
Markers2 Summary (MAST with RNA Assay):
summarize_markers(markers_mast_RNA)
Number of genes with p_val_adj = 0: 78
Number of genes with p_val_adj = 1: 4853
Number of significant genes (p_val_adj < 0.05): 8198
Number of upregulated genes (avg_log2FC > 1): 4183
Number of downregulated genes (avg_log2FC < -1): 2606
### Compare Results Between RNA and SCT Assays
# Load gene sets
genes_mast_sct <- markers_mast_SCT$gene
genes_mast_rna <- markers_mast_RNA$gene
common_genes_mast <- intersect(genes_mast_sct, genes_mast_rna)
cat("MAST: Common Genes Between SCT and RNA:", length(common_genes_mast), "\n")
MAST: Common Genes Between SCT and RNA: 13478
EnhancedVolcano(markers_mast_SCT,
lab = markers_mast_SCT$gene,
x = "avg_log2FC",
y = "p_val_adj",
title = "Malignant_vs_Normal_MAST_SCT",
pCutoff = 0.05,
FCcutoff = 1.0)
Warning: One or more p-values is 0. Converting to 10^-1 * current lowest
non-zero p-value...
EnhancedVolcano(markers_mast_RNA,
lab = markers_mast_RNA$gene,
x = "avg_log2FC",
y = "p_val_adj",
title = "Malignant_vs_Normal_MAST_RNA",
pCutoff = 0.05,
FCcutoff = 1.0)
Warning: One or more p-values is 0. Converting to 10^-1 * current lowest
non-zero p-value...
### 1. Load Data and Prepare for Comparison
library(ggplot2)
df_sct <- read.csv("../0-robj/RNA_SCT_test_default_pct_fc_parameters/1-MAST_with_SCT_batch_cellline_as_Covariate.csv", row.names = 1)
df_rna <- read.csv("../0-robj/RNA_SCT_test_default_pct_fc_parameters/2-MAST_with_RNA_batch_cellline_as_Covariate.csv", row.names = 1)
df_sct$assay <- "SCT"
df_rna$assay <- "RNA"
df_combined <- rbind(df_sct, df_rna)
### 2. Volcano Plot for SCT vs RNA Comparison
ggplot(df_combined, aes(x = avg_log2FC, y = -log10(p_val_adj), color = assay)) +
geom_point(alpha = 0.7) +
theme_minimal() +
labs(title = "Volcano Plot: RNA vs SCT", x = "Log2 Fold Change", y = "-Log10 Adjusted P-value") +
scale_color_manual(values = c("SCT" = "blue", "RNA" = "red"))
# Correlation of Log2 Fold Change Between RNA and SCT
# Goal: See if genes have consistent fold changes between RNA and SCT normalization.
library(dplyr) # Load dplyr to use rownames_to_column
# Convert row names to a "gene" column
df_sct <- df_sct %>% rownames_to_column(var = "gene")
df_rna <- df_rna %>% rownames_to_column(var = "gene")
# Merge datasets by gene
df_merged <- merge(df_sct, df_rna, by = "gene", suffixes = c("_SCT", "_RNA"))
# Scatter plot of log2FC values
ggplot(df_merged, aes(x = avg_log2FC_SCT, y = avg_log2FC_RNA)) +
geom_point(alpha = 0.7, color = "blue") +
geom_smooth(method = "lm", color = "red", linetype = "dashed") +
theme_minimal() +
labs(title = "Log2 Fold Change Comparison: SCT vs RNA",
x = "SCT Log2 Fold Change",
y = "RNA Log2 Fold Change") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "black")
`geom_smooth()` using formula = 'y ~ x'
cor(df_merged$avg_log2FC_SCT, df_merged$avg_log2FC_RNA, use = "complete.obs")
[1] 0.9923688
# Higher correlation (~0.8-1.0) suggests consistency between methods.
# Lower correlation (<0.6) may indicate different normalization effects.
library(ggplot2)
library(dplyr)
# Define a broader list of known Sezary genes
sezary_genes <- c("KIR3DL2", "TOX", "TWIST1", "CD52", "CCR4", "IL2RA", "CD7", "DPP4", "CCR7", "GATA3", "FOXP3", "CCR7", "CD70", "PLS3", "PDCD1")
# Filter the merged data to include only the known Sezary genes
df_filtered <- df_merged %>%
filter(gene %in% sezary_genes)
# Volcano plot showing only the known Sezary genes with gene names
ggplot(df_filtered, aes(x = avg_log2FC_SCT, y = -log10(p_val_adj_SCT), color = gene)) +
geom_point(alpha = 0.7) +
geom_text(aes(label = gene), vjust = 1.5, hjust = 0.5, size = 3, color = "black") + # Adding gene names
theme_minimal() +
labs(title = "Volcano Plot: Known Sezary Genes (SCT Assay)",
x = "Log2 Fold Change (SCT)",
y = "-Log10 Adjusted P-value") +
scale_color_manual(values = c(
"KIR3DL2" = "#1f77b4", # blue
"TOX" = "#ff7f0e", # orange
"TWIST1" = "#2ca02c", # green
"CD52" = "#9467bd", # purple
"CCR4" = "#8c564b", # brown
"IL2RA" = "#e377c2", # pink
"CD7" = "#7f7f7f", # gray
"DPP4" = "#bcbd22", # yellow-green
"CCR7" = "#17becf", # cyan
"GATA3" = "#d62728", # red
"FOXP3" = "#8a2be2", # blue-violet
"DNM3" = "#ff6347", # tomato
"NCR1" = "#3cb371", # medium sea green
"CD70" = "#ff1493", # deep pink
"AIRE" = "#00bfff", # deep sky blue
"PDCD1 (PD-1)" = "#4c72b0", # steel blue
"CD274 (PD-L1)" = "#c5b0d5" # light purple
)) +
theme(legend.position = "top")
library(ggplot2)
library(dplyr)
# Define the list of known Sezary genes
sezary_genes <- c("KIR3DL2", "TOX", "TWIST1", "CD52", "CCR4", "IL2RA", "CD7", "DPP4", "CCR7", "GATA3", "FOXP3", "CCR7", "CD70", "PLS3", "PDCD1")
# Filter the merged data to include only the known Sezary genes for RNA data
df_filtered_rna <- df_merged %>%
filter(gene %in% sezary_genes)
# Volcano plot showing only the known Sezary genes for RNA assay with gene names
ggplot(df_filtered_rna, aes(x = avg_log2FC_RNA, y = -log10(p_val_adj_RNA), color = gene)) +
geom_point(alpha = 0.7) +
geom_text(aes(label = gene), vjust = 1.5, hjust = 0.5, size = 3, color = "black") + # Adding gene names
theme_minimal() +
labs(title = "Volcano Plot: Known Sezary Genes (RNA Assay)",
x = "Log2 Fold Change (RNA)",
y = "-Log10 Adjusted P-value") +
scale_color_manual(values = c(
"KIR3DL2" = "#1f77b4", # blue
"TOX" = "#ff7f0e", # orange
"TWIST1" = "#2ca02c", # green
"CD52" = "#9467bd", # purple
"CCR4" = "#8c564b", # brown
"IL2RA" = "#e377c2", # pink
"CD7" = "#7f7f7f", # gray
"DPP4" = "#bcbd22", # yellow-green
"CCR7" = "#17becf", # cyan
"GATA3" = "#d62728", # red
"FOXP3" = "#8a2be2", # blue-violet
"DNM3" = "#ff6347", # tomato
"NCR1" = "#3cb371", # medium sea green
"CD70" = "#ff1493", # deep pink
"AIRE" = "#00bfff", # deep sky blue
"PDCD1 (PD-1)" = "#4c72b0", # steel blue
"CD274 (PD-L1)" = "#c5b0d5" # light purple
)) +
theme(legend.position = "top")