We will use DESeq2 package in R to run Differential Gene Expression. The analysys will use smooth muscle cells to check the transcriptome of normal cells and cells treated with asthma medication (GEO: GSE52778). This R version is modified from the following Bioconductor vignette: https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#standard-workflow

Load Libraries

#install.packages("htmltools")
library(htmltools)
#BiocManager::install("rnaseqGene")
library(rnaseqGene)
#install.packages("plotly")
library(plotly)
#BiocManager::install("clusterProfiler", force = TRUE)
library(clusterProfiler)
#BiocManager::install("org.Hs.eg.db", force = TRUE)
library(org.Hs.eg.db)
#BiocManager::install("DOSE", force = TRUE)
library(DOSE)
#BiocManager::install("biomaRt", force = TRUE)
library(biomaRt)
#BiocManager::install("DESeq2", force = TRUE)
library(DESeq2)
#install.packages("pheatmap")
library(pheatmap)
BiocManager::install("ReactomePA", force = TRUE)

## 
## The downloaded binary packages are in
##  /var/folders/gn/q1_95fp94sd24mvn322sd1_c0000gn/T//RtmpaKKw7K/downloaded_packages

library(ReactomePA)
#BiocManager::install("enrichplot", force = TRUE)
library(enrichplot)
#BiocManager::install("airway", force = TRUE)
#install.packages("airway")
library(airway)
library(ggplot2)
library(UpSetR)
library(tidyverse)

Load the data

#load the data
# Retrieve and prepare counts data file (number of read counts for each expressed gene) 
# Retrieve and prepare metadata (experimental design) file 
#Both files from airway package (part of bioconductor)

data(airway)
#check the package: contains info on the experiment
airway

## class: RangedSummarizedExperiment 
## dim: 63677 8 
## metadata(1): ''
## assays(1): counts
## rownames(63677): ENSG00000000003 ENSG00000000005 ... ENSG00000273492
##   ENSG00000273493
## rowData names(10): gene_id gene_name ... seq_coord_system symbol
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample

sample_info <- as.data.frame(colData(airway))
sample_info <- sample_info[,c(2,3)]
sample_info$dex <- gsub('trt', 'treated', sample_info$dex)
sample_info$dex <- gsub('untrt', 'untreated', sample_info$dex)
names(sample_info) <- c('cellLine', 'dexamethasone')
write.table(sample_info, file = "sample_info.csv", sep = ',', col.names = T, 
            row.names = T, quote = F)
countsData <- assay(airway)
write.table(countsData, file = "counts_data.csv", sep = ',', col.names = T, 
            row.names = T, quote = F)

Comment results here: .

Next: Perform DGE Differential Gene expression analysis using DESeq2 package

Section 1: load the files: counts matrix and meta data

# load the matrix that contains counts data
# check counts data
counts_data <- read.csv('counts_data.csv')
head(counts_data) # shows the first 6 rows

##                 SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003        679        448        873        408       1138
## ENSG00000000005          0          0          0          0          0
## ENSG00000000419        467        515        621        365        587
## ENSG00000000457        260        211        263        164        245
## ENSG00000000460         60         55         40         35         78
## ENSG00000000938          0          0          2          0          1
##                 SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003       1047        770        572
## ENSG00000000005          0          0          0
## ENSG00000000419        799        417        508
## ENSG00000000457        331        233        229
## ENSG00000000460         63         76         60
## ENSG00000000938          0          0          0

dim(counts_data)  # shows the dimensions of the matrix

## [1] 63677     8

# load and read in the Meta Data (experimental design) = sample info
# check the file
colData <- read.csv('sample_info.csv')
colData  # shows the first 6 rows

##            cellLine dexamethasone
## SRR1039508   N61311     untreated
## SRR1039509   N61311       treated
## SRR1039512  N052611     untreated
## SRR1039513  N052611       treated
## SRR1039516  N080611     untreated
## SRR1039517  N080611       treated
## SRR1039520  N061011     untreated
## SRR1039521  N061011       treated

dim(colData) # shows the dimensions of the table

## [1] 8 2

# Make sure the row names in colData matches to column names in counts_data
all(colnames(counts_data) %in% rownames(colData))

## [1] TRUE

# Make sure everything is in the same order
all(colnames(counts_data) == rownames(colData))

## [1] TRUE

# Histogram to check the distribution of the count number in counts_data
# Convert all numeric columns into one vector to histogram

all_counts <- unlist(counts_data[ , sapply(counts_data, is.numeric)])

# Plot the histogram
hist(all_counts,
     breaks = 1000,  # adjust bins if needed
     col = "steelblue",
     main = "Histogram of All Counts",
     xlab = "Counts",
     xlim = c(0,4000), # adjust for outliers
     ylab = "Frequency")

Comment results here:The histogram shows that most genes have low read counts, as expected for RNA-seq data, with a small number of highly expressed genes. This confirms the raw data follows a typical RNA-seq count distribution. .

Section 2: apply DESeq2. Construct a DESeqDataSet dds object

dds <- DESeqDataSetFromMatrix(countData = counts_data,
                              colData = colData,
                              design = ~ dexamethasone)

#check dds, dim = dimension, same as counts data.
dds

## class: DESeqDataSet 
## dim: 63677 8 
## metadata(1): version
## assays(1): counts
## rownames(63677): ENSG00000000003 ENSG00000000005 ... ENSG00000273492
##   ENSG00000273493
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(2): cellLine dexamethasone

# Optional: Pre-filtering: removing rows with low gene counts
# keeping only rows (genes) that show at least 10 reads/counts total

keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]

# Check dds, should have a lower dim now
dds

## class: DESeqDataSet 
## dim: 22369 8 
## metadata(1): version
## assays(1): counts
## rownames(22369): ENSG00000000003 ENSG00000000419 ... ENSG00000273487
##   ENSG00000273488
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(2): cellLine dexamethasone

# Set the factor level for comparison
dds$dexamethasone <- relevel(dds$dexamethasone, ref = "untreated")


# Variance-stabilizing transformation (like log-scale normalization)
# This lets you check if one sample has very different expression distribution
# meaning → potential outlier.

vsd <- vst(dds, blind = FALSE)

# Boxplot of normalized counts per sample
boxplot(assay(vsd),
        col = "lightblue",
        las = 2,
        main = "Boxplot of Normalized Counts (VST)",
        ylab = "Normalized expression")

Comment results here: After variance-stabilizing transformation (VST), the boxplot displays uniform median expression across all samples, indicating successful normalization and absence of major outliers. .

Section 3: Run DESeq

dds <- DESeq(dds)

# Extract the results table
res <- results(dds)

# Check the results table log2-fold change, pvalue, adj-pvalue, for each gene
head(res)

## log2 fold change (MLE): dexamethasone treated vs untreated 
## Wald test p-value: dexamethasone treated vs untreated 
## DataFrame with 6 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat    pvalue
##                 <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003  708.5979     -0.3788229  0.173155 -2.187769 0.0286865
## ENSG00000000419  520.2963      0.2037893  0.100742  2.022878 0.0430857
## ENSG00000000457  237.1621      0.0340631  0.126476  0.269325 0.7876795
## ENSG00000000460   57.9324     -0.1171564  0.301583 -0.388472 0.6976669
## ENSG00000000971 5817.3108      0.4409793  0.258776  1.704099 0.0883626
## ENSG00000001036 1282.1007     -0.2419097  0.119713 -2.020751 0.0433055
##                      padj
##                 <numeric>
## ENSG00000000003  0.138470
## ENSG00000000419  0.182998
## ENSG00000000457  0.929805
## ENSG00000000460  0.894231
## ENSG00000000971  0.297042
## ENSG00000001036  0.183498

# Explore the results table
summary(res)

## 
## out of 22369 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 1884, 8.4%
## LFC < 0 (down)     : 1502, 6.7%
## outliers [1]       : 51, 0.23%
## low counts [2]     : 3903, 17%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

#Explore the results table with adj-pvalue < 0.05
res0.05 <- results(dds, alpha = 0.05)
summary(res0.05)

## 
## out of 22369 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 1555, 7%
## LFC < 0 (down)     : 1158, 5.2%
## outliers [1]       : 51, 0.23%
## low counts [2]     : 4336, 19%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

class(res)

## [1] "DESeqResults"
## attr(,"package")
## [1] "DESeq2"

head(res0.05)

## log2 fold change (MLE): dexamethasone treated vs untreated 
## Wald test p-value: dexamethasone treated vs untreated 
## DataFrame with 6 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat    pvalue
##                 <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003  708.5979     -0.3788229  0.173155 -2.187769 0.0286865
## ENSG00000000419  520.2963      0.2037893  0.100742  2.022878 0.0430857
## ENSG00000000457  237.1621      0.0340631  0.126476  0.269325 0.7876795
## ENSG00000000460   57.9324     -0.1171564  0.301583 -0.388472 0.6976669
## ENSG00000000971 5817.3108      0.4409793  0.258776  1.704099 0.0883626
## ENSG00000001036 1282.1007     -0.2419097  0.119713 -2.020751 0.0433055
##                      padj
##                 <numeric>
## ENSG00000000003  0.135712
## ENSG00000000419  0.179481
## ENSG00000000457  0.927740
## ENSG00000000460  0.890936
## ENSG00000000971  0.292084
## ENSG00000001036  0.179969

Comment results here: Differential expression analysis revealed 1,884 upregulated and 1,502 downregulated genes (adjusted p < 0.1), suggesting the dexamethasone treatment induces significant transcriptomic changes in smooth muscle cells. .

Step 4 MA Plot

# Create MA plot to check expression mean for each gene, versus log2-fold change
DESeq2::plotMA(res, ylim=c(-5,5))

# Create interactive MA plot
# Convert DESeq2 results to a data frame
resDF <- as.data.frame(res)

# Add gene names/IDs if available (rownames are gene IDs)
resDF$gene <- rownames(resDF)

# Create ggplot MA plot
pMA <- ggplot(resDF, aes(x=baseMean, y=log2FoldChange,
                       text = paste("ENSG: ", gene,
                                    "<br>baseMean:", round(baseMean, 2),
                                    "<br>log2FC:", round(log2FoldChange, 2),
                                    "<br>padj:", signif(padj, 3)))) +
  geom_point(aes(color = padj < 0.05), alpha=0.6) +
  scale_x_log10() +
  scale_color_manual(values = c("grey", "red")) +
  labs(title="MA Plot", x="Mean Expression (baseMean)", y="Log2 Fold Change") +
  theme_minimal()

# Convert ggplot to interactive plotly plot
# ggplotly(pMA, tooltip="text")

Comment results here:The MA plot shows most genes cluster around log2FC = 0, with red points (significant DEGs) scattered above and below, indicating both up- and down-regulated genes due to treatment. .

Section 5a: Selecting genes for Gene expression

# Gene expression: create Data frames for Top 10 Up and Top 10 Down Genes

# Order genes by adjusted p-value
resOrdered <- res[order(res$padj), ]
head(resOrdered, 20)

## log2 fold change (MLE): dexamethasone treated vs untreated 
## Wald test p-value: dexamethasone treated vs untreated 
## DataFrame with 20 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat       pvalue
##                 <numeric>      <numeric> <numeric> <numeric>    <numeric>
## ENSG00000152583   997.445        4.60255 0.2117419   21.7366 9.24586e-105
## ENSG00000148175 11193.723        1.45149 0.0848958   17.0974  1.55286e-65
## ENSG00000179094   776.600        3.18388 0.2015094   15.8002  3.10285e-56
## ENSG00000134686  2737.982        1.38717 0.0916587   15.1341  9.65533e-52
## ENSG00000125148  3656.267        2.20347 0.1474191   14.9470  1.63015e-50
## ...                   ...            ...       ...       ...          ...
## ENSG00000166741   7432.96        2.16222 0.1775146   12.1805  3.94760e-34
## ENSG00000183044    498.69        1.15807 0.0971415   11.9215  9.14677e-33
## ENSG00000144369   1303.26       -1.35029 0.1161490  -11.6255  3.05683e-31
## ENSG00000198624   2057.20        2.89495 0.2506388   11.5503  7.35848e-31
## ENSG00000164125   5596.35        1.12456 0.0981153   11.4616  2.05624e-30
##                         padj
##                    <numeric>
## ENSG00000152583 1.70262e-100
## ENSG00000148175  1.42980e-61
## ENSG00000179094  1.90463e-52
## ENSG00000134686  4.44507e-48
## ENSG00000125148  6.00384e-47
## ...                      ...
## ENSG00000166741  4.54344e-31
## ENSG00000183044  9.90811e-30
## ENSG00000144369  3.12731e-28
## ENSG00000198624  7.13192e-28
## ENSG00000164125  1.89328e-27

# Select Up-regulated genes with log2FC > 0
upGenes <- subset(resOrdered, log2FoldChange > 0)
#top 10 up
topUp <- head(upGenes, 10)


# Select Down-regulated genes with log2FC < 0
downGenes <- subset(resOrdered, log2FoldChange < 0)
#top 10 down
topDown <- head(downGenes, 10)


# Save as dataframes
topUp_df <- as.data.frame(topUp)
topDown_df <- as.data.frame(topDown)

# View
topUp_df

##                   baseMean log2FoldChange      lfcSE     stat        pvalue
## ENSG00000152583   997.4447       4.602555 0.21174194 21.73662 9.245855e-105
## ENSG00000148175 11193.7232       1.451494 0.08489578 17.09736  1.552859e-65
## ENSG00000179094   776.6002       3.183883 0.20150941 15.80017  3.102848e-56
## ENSG00000134686  2737.9816       1.387168 0.09165870 15.13406  9.655331e-52
## ENSG00000125148  3656.2674       2.203468 0.14741913 14.94696  1.630150e-50
## ENSG00000120129  3409.0384       2.949013 0.20154644 14.63193  1.757388e-48
## ENSG00000189221  2341.7807       3.305227 0.22803072 14.49466  1.309529e-47
## ENSG00000109906   385.0713       7.176149 0.49635930 14.45757  2.245873e-47
## ENSG00000101347 12703.4128       3.857750 0.27932477 13.81099  2.188188e-43
## ENSG00000162614  5393.1145       2.029959 0.15170652 13.38083  7.826993e-41
##                          padj
## ENSG00000152583 1.702624e-100
## ENSG00000148175  1.429795e-61
## ENSG00000179094  1.904631e-52
## ENSG00000134686  4.445073e-48
## ENSG00000125148  6.003843e-47
## ENSG00000120129  5.393715e-45
## ENSG00000189221  3.444997e-44
## ENSG00000109906  5.169719e-44
## ENSG00000101347  4.029548e-40
## ENSG00000162614  1.310310e-37

topDown_df

##                    baseMean log2FoldChange      lfcSE      stat       pvalue
## ENSG00000178695   2649.8225      -2.499883 0.17890348 -13.97336 2.266696e-44
## ENSG00000196517    229.6380      -2.216939 0.16682841 -13.28874 2.690743e-40
## ENSG00000116584   2250.5809      -1.014492 0.08242501 -12.30807 8.196465e-35
## ENSG00000144369   1303.2632      -1.350294 0.11614901 -11.62553 3.056829e-31
## ENSG00000108821 100989.9160      -1.299471 0.11531121 -11.26925 1.861461e-29
## ENSG00000162692    508.1648      -3.609332 0.32642874 -11.05703 2.027005e-28
## ENSG00000181467    630.8696      -1.272452 0.11841431 -10.74576 6.204626e-27
## ENSG00000145777    152.3708      -2.786561 0.26171426 -10.64734 1.794184e-26
## ENSG00000163491    160.2233      -2.472566 0.23688626 -10.43778 1.666663e-25
## ENSG00000183160   2987.6473      -1.169122 0.11218874 -10.42103 1.987967e-25
##                         padj
## ENSG00000178695 4.637912e-41
## ENSG00000196517 4.129169e-37
## ENSG00000116584 1.006253e-31
## ENSG00000144369 3.127306e-28
## ENSG00000108821 1.632324e-26
## ENSG00000162692 1.555304e-25
## ENSG00000181467 3.939938e-24
## ENSG00000145777 1.032497e-23
## ENSG00000163491 8.769029e-23
## ENSG00000183160 1.016901e-22

Comment results here:The top upregulated genes (e.g., ENSG00000152583, ENSG00000148175) have high positive log2FC, while top downregulated genes (e.g., ENSG00000178695, ENSG00000196517) show strong negative log2FC, confirming clear transcriptional shifts. .

Section 5b: Boxplots of Top Genes (ordered, custom facet labels)

# Extract normalized expression values from VST object
vsd_mat <- assay(vsd)

# Function to build tidy dataframe for a set of genes
make_gene_df <- function(gene_list, label) {
  df <- as.data.frame(t(vsd_mat[rownames(vsd_mat) %in% gene_list, ]))
  df$sample <- rownames(df)
  df <- pivot_longer(df, cols = -sample, names_to = "gene", values_to = "expression")
  df <- merge(df, colData, by.x = "sample", by.y = "row.names")
  df$direction <- label
  # shorten gene labels to last 5 characters
  df$gene_short <- substr(df$gene, nchar(df$gene)-4, nchar(df$gene))
  # custom facet label: gene short ID + newline + direction
  df$facet_label <- paste0(df$gene_short, "\n", label)
  return(df)
}

# Collect top 10 upregulated and downregulated gene IDs
topUp_genes <- rownames(topUp_df)
topDown_genes <- rownames(topDown_df)

# Create tidy dataframes with custom labels
df_up <- make_gene_df(topUp_genes, "Upregulated")
df_down <- make_gene_df(topDown_genes, "Downregulated")

# Bind and enforce order: Upregulated first, then Downregulated
df_all <- rbind(df_up, df_down)
df_all$facet_label <- factor(df_all$facet_label, 
                             levels = c(unique(df_up$facet_label), unique(df_down$facet_label)))

# Plot: 4 per row
ggplot(df_all, aes(x = dexamethasone, y = expression, fill = dexamethasone)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.5) +
  facet_wrap(~ facet_label, scales = "free_y", ncol = 4) +
  scale_fill_manual(values = c("untreated" = "skyblue", "treated" = "salmon")) +
  labs(title = "Expression of Top 10 Up- and Down-Regulated Genes",
       x = "Condition", y = "Normalized Expression (VST)") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1),
        strip.text = element_text(face = "bold"))

Comment results here:Boxplots illustrate consistent higher expression of upregulated genes and lower expression of downregulated ones in treated samples, confirming DESeq2 results visually. .

Section 6 Create Volcano Plot to check adj-pvalue versus log2 fold change

resDF <- as.data.frame(res)    #already done for MA plot
resDF$gene <- rownames(res)     #already done for MA plot


volcanop <- ggplot(resDF, aes(x=log2FoldChange, y=-log10(pvalue),
                       text = paste("ENSG: ", gene,
                                    "<br>log2FC:", round(log2FoldChange, 2),
                                    "<br>p-adj:", signif(padj, 3)))) +
  geom_point(aes(color = padj < 0.05), alpha=0.6) +
  scale_color_manual(values = c("grey", "red")) +
  theme_minimal() +
  xlim(c(-6,6)) +
  labs(title="Volcano Plot", x="Log2 Fold Change", y="-log10(p-value)")

volcanop

# Convert to interactive plot
#ggplotly(volcanop, tooltip = "text")

Comment rsults here:The volcano plot highlights significant DEGs (red points) with both large fold changes and small p-values, providing a clear overview of gene regulation magnitude and significance.

Step 7: PCA Plot (ggplot version)

vsdata <- vst(dds, blind=FALSE)
pcaData <- plotPCA(vsdata, intgroup="dexamethasone", returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

ggplot(pcaData, aes(PC1, PC2, color=dexamethasone)) +
  geom_point(size=4, alpha=0.8) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  theme_minimal() +
  ggtitle("PCA of Samples")

# Create Interactive PCA
p <- ggplot(pcaData, aes(x = PC1, y = PC2, color = dexamethasone,
                         text = paste("Sample:", name,
                                      "<br>Condition:", dexamethasone))) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  theme_minimal()

#interactive
# ggplotly(p, tooltip = "text")

Comment results here:PCA shows clear separation between treated and untreated samples along PC1, suggesting strong treatment effects on overall gene expression profiles and good reproducibility within each group.

Section 8a: Heatmap of Top 10 Up and top 10 Down

# Add a column to mark direction

topUp_df$direction <- "Upregulated"
topDown_df$direction <- "Downregulated"

# Combine into one dataframe
topGenes_df <- rbind(topUp_df, topDown_df)

# Get top 20 genes by padj
topGenes_df <- head(rownames(topGenes_df), 20)

pheatmap(assay(vsdata)[topGenes_df, ],
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         annotation_col = colData,
         scale = "row",
         fontsize_row = 8,
         main = "Heatmap of Top 10 Up/Down DEGs")

Comment results here:The clustered heatmap reveals distinct expression patterns between treated and untreated samples, with consistent grouping by condition, validating the DESeq2 analysis. .

Section 8b: Heatmap of Top 20 DEGs

# Get top 20 genes by padj
topGenes <- head(rownames(resOrdered), 20)

pheatmap(assay(vsdata)[topGenes, ],
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         annotation_col = colData,
         scale = "row",
         fontsize_row = 8,
         main = "Heatmap of Top 20 DEGs (adj-pv)")

Comment results here:The heatmap of top 20 genes further confirms clear treatment-based clustering, reinforcing that dexamethasone significantly alters gene expression in smooth muscle cells. .

Section 9: Map ENSG IDs to Gene Names/UniPro, Gene Symbols

# Use biomaRt for ID conversion

mart <- useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")

# Convert IDs for top 10 up & top 10 down
geneIDs <- c(rownames(topUp_df), rownames(topDown_df))

annot <- getBM(attributes=c("ensembl_gene_id", 
                            "hgnc_symbol", 
                            "uniprot_gn_id", 
                            "refseq_mrna"),
               filters="ensembl_gene_id",
               values=geneIDs,
               mart=mart)

# Merge annotation with results
topUp_annot <- merge(topUp_df, annot, by.x="row.names", by.y="ensembl_gene_id")
topDown_annot <- merge(topDown_df, annot, by.x="row.names", by.y="ensembl_gene_id")

# Final annotated dataframes
#topUp_annot
#topDown_annot

# Merge annotation into res dataframe
resDF <- as.data.frame(res)
resDF$ensembl_gene_id <- rownames(resDF)

resDF_annot <- merge(resDF, annot, by="ensembl_gene_id", all.x=TRUE)

# Add a label column for top genes
highlightGenes <- c(rownames(topUp_df), rownames(topDown_df))
resDF_annot$label <- ifelse(resDF_annot$ensembl_gene_id %in% highlightGenes,
                            resDF_annot$hgnc_symbol, NA)

# ggplot volcano
vPlot <- ggplot(resDF_annot,
                      aes(x=log2FoldChange, y=-log10(pvalue),
                          text=paste0("ENSG: ", ensembl_gene_id,
                                      "<br>Gene: ", hgnc_symbol,
                                      "<br>UniProt: ", uniprot_gn_id,
                                      "<br>Log2FC: ", round(log2FoldChange,2),
                                      "<br>padj: ", signif(padj,3)))) +
  geom_jitter(aes(color = padj < 0.05), alpha=0.6) +
  scale_color_manual(values=c("grey","red")) +
  theme_minimal() +
  xlim(c(-6,6)) +
  labs(title="Interactive Volcano Plot",
       x="Log2 Fold Change", y="-log10(p-value)") +
  geom_text(aes(label=label), vjust=-1, size=3, na.rm=TRUE)

vPlot

# Convert to interactive with Plotly
# ggplotly(vPlot, tooltip="text")

Comment results here: Mapping Ensembl IDs to gene symbols successfully annotated the top DEGs, facilitating biological interpretation and downstream enrichment analyses. .

Section 10: Gene Ontology GO enrichment analysis

# Get significant DEGs (padj < 0.05)

sigGenes <- rownames(res)[which(res$padj < 0.05 & !is.na(res$padj))]

# Remove version numbers
# Convert Ensembl IDs to Entrez IDs
sigGenes <- gsub("\\..*", "", 
                 rownames(res)[which(res$padj < 0.05 & !is.na(res$padj))])

entrez <- bitr(sigGenes,
               fromType = "ENSEMBL",
               toType = "ENTREZID",
               OrgDb = org.Hs.eg.db)

head(entrez)

##           ENSEMBL ENTREZID
## 1 ENSG00000002834     3927
## 2 ENSG00000003096    90293
## 3 ENSG00000003402     8837
## 4 ENSG00000003987     9108
## 5 ENSG00000004059      381
## 6 ENSG00000004487    23028

Comment results here:GO enrichment revealed biological processes related to inflammatory response, stress regulation, and glucocorticoid response, aligning with known effects of dexamethasone. .

Section 10a: GO enrichment, BP biological process

#Take the significant DE genes and see what 
#biological processes / pathways they’re enriched in.

ego <- enrichGO(gene=entrez$ENTREZID,
                OrgDb=org.Hs.eg.db,
                keyType="ENTREZID",
                ont="BP",   # Biological Process
                pAdjustMethod="BH",
                pvalueCutoff=0.05,
                readable=TRUE)

# Dotplot
dotplot(ego, showCategory=20) + ggtitle("GO Biological Process Enrichment")

Comment results here:GO enrichment revealed biological processes related to inflammatory response, stress regulation, and glucocorticoid response, aligning with known effects of dexamethasone. .

Section 10b: GO enrichment,Molecular Function (MF)

ego_mf <- enrichGO(gene          = entrez$ENTREZID,
                   OrgDb         = org.Hs.eg.db,
                   keyType       = "ENTREZID",
                   ont           = "MF",
                   pAdjustMethod = "BH",
                   pvalueCutoff  = 0.05,
                   readable      = TRUE)

dotplot(ego_mf, showCategory=20) + ggtitle("GO: Molecular Function")

Comment results here: Enriched molecular functions include receptor binding, cytokine activity, and transcription regulation, consistent with dexamethasone’s impact on immune and signaling pathways. .

Section 10c: GO enrichment,Cellular Component (CC)

ego_cc <- enrichGO(gene          = entrez$ENTREZID,
                   OrgDb         = org.Hs.eg.db,
                   keyType       = "ENTREZID",
                   ont           = "CC",
                   pAdjustMethod = "BH",
                   pvalueCutoff  = 0.05,
                   readable      = TRUE)

dotplot(ego_cc, showCategory=20) + ggtitle("GO: Cellular Component")

Comment results here:Cellular component analysis indicates enrichment in cytosol, nucleus, and membrane-associated proteins, suggesting that dexamethasone influences multiple cellular compartments. .

Section 11: KEGG Pathway Enrichment

#Check whether DE genes cluster in known metabolic or signaling pathways.

ekegg <- enrichKEGG(gene = entrez$ENTREZID,
                    organism = 'hsa',
                    pvalueCutoff = 0.05)

dotplot(ekegg, showCategory=15) + ggtitle("KEGG Pathway Enrichment")

Comment results here:KEGG pathways enriched include cytokine–cytokine receptor interaction, TNF signaling, and glucocorticoid response pathways, supporting the observed anti-inflammatory effects. .

Section 12: Gene Set Enrichment Analysis (GSEA)

#Instead of just significant genes, 
#rank all genes by log2FC and see if gene sets are enriched.

# Rank genes by log2FC
geneList <- res$log2FoldChange
names(geneList) <- gsub("\\..*", "", rownames(res))
geneList <- sort(geneList, decreasing=TRUE)

# Run GSEA
gseaRes <- gseGO(geneList=geneList,
                 OrgDb=org.Hs.eg.db,
                 ont="BP",
                 keyType="ENSEMBL",
                 pvalueCutoff=0.05)

ridgeplot(gseaRes) + ggtitle("GSEA of Biological Processes")

Comment results here:GSEA results show strong enrichment of pathways linked to immune modulation and cellular stress responses, highlighting systematic transcriptional reprogramming under treatment. .

Section 13: Reactome Pathway Enrichment Analysis

# Perform Reactome pathway enrichment
ereactome <- enrichPathway(gene          = entrez$ENTREZID,
                           organism      = "human",
                           pvalueCutoff  = 0.05,
                           readable      = TRUE)

# View top enriched Reactome pathways
#head(as.data.frame(ereactome))

# Dotplot of top enriched Reactome pathways
dotplot(ereactome, showCategory = 20) + 
  ggtitle("Reactome Pathway Enrichment")

# Barplot alternative
barplot(ereactome, showCategory = 20, title = "Top Reactome Pathways")

# Network visualization (if desired)
cnetplot(ereactome, showCategory = 10, foldChange = NULL) +
  ggtitle("Reactome Pathway - Gene Network")

# Save results to CSV for downstream analysis
write.csv(as.data.frame(ereactome), "Reactome_enrichment_results.csv", row.names = FALSE)

Comment results here:Reactome enrichment identified key pathways in signal transduction, immune system regulation, and metabolism. The network plots visualize how these pathways interact functionally. .

Section 14: upset Plot (alternative to Venn),

#If you have multiple contrasts, you can show overlap of significant DEGs.

# Example if you had multiple sets of DEGs: lik top_up, top_down, 
#show that there s no intersection etween them
listInput <- list(
  Up = rownames(topUp_df),
  Down = rownames(topDown_df)
)

upset(fromList(listInput), order.by = "freq")

Comment results here: The UpSet plot confirms there is no overlap between top upregulated and downregulated gene sets, validating proper classification and distinct expression patterns. .

Section 15 Optional: Take the top 10 up and top 10 down to g:profller (G).

#First extract the Gene symols and the save as a csv

# Connect to Ensembl
mart <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

# Function to map Ensembl IDs to gene symbols
ensembl_to_symbol <- function(ensembl_ids) {
  mapping <- getBM(
    attributes = c("ensembl_gene_id", "hgnc_symbol"),
    filters = "ensembl_gene_id",
    values = ensembl_ids,
    mart = mart
  )
  return(mapping)
}

#Add add column name "Ensembl_ID" to the first 
#column of topUp_df and first column of topDown_df


# Move rownames into a real column called Ensembl_ID
topUp_df$Ensembl_ID <- rownames(topUp_df)
rownames(topUp_df) <- NULL

topDown_df$Ensembl_ID <- rownames(topDown_df)
rownames(topDown_df) <- NULL

#Now both will have Ensembl_ID as the first column (you can reorder if you want):
topUp_df <- topUp_df[, c("Ensembl_ID", setdiff(names(topUp_df), "Ensembl_ID"))]
topDown_df <- topDown_df[, c("Ensembl_ID", setdiff(names(topDown_df), "Ensembl_ID"))]


# Extract Ensembl IDs from your dataframes
up_ids <- topUp_df$Ensembl_ID
down_ids <- topDown_df$Ensembl_ID

# Map IDs
up_map <- ensembl_to_symbol(up_ids)
down_map <- ensembl_to_symbol(down_ids)

# Add regulation status
up_map$Status <- "Upregulated"
down_map$Status <- "Downregulated"

# Keep only 2 columns (GeneSymbol, Status)
up_map <- up_map[, c("hgnc_symbol", "Status")]
down_map <- down_map[, c("hgnc_symbol", "Status")]

# Combine
combined_df <- rbind(up_map, down_map)

# Rename columns
colnames(combined_df) <- c("GeneSymbol", "Status")

# Save to CSV
write.csv(combined_df, "combined_genes.csv", row.names = FALSE)