We will use DESeq2 package in R to run Differential Gene Expression. The analysys will use smooth muscle cells to check the transcriptome of normal cells and cells treated with asthma medication (GEO: GSE52778). This R version is modified from the following Bioconductor vignette: https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#standard-workflow

Load Libraries

#install.packages("htmltools")
library(htmltools)
#BiocManager::install("rnaseqGene")
library(rnaseqGene)
#install.packages("plotly")
library(plotly)
#BiocManager::install("clusterProfiler", force = TRUE)
library(clusterProfiler)
#BiocManager::install("org.Hs.eg.db", force = TRUE)
library(org.Hs.eg.db)
#BiocManager::install("DOSE", force = TRUE)
library(DOSE)
#BiocManager::install("biomaRt", force = TRUE)
library(biomaRt)
#BiocManager::install("DESeq2", force = TRUE)
library(DESeq2)
#install.packages("pheatmap")
library(pheatmap)
#BiocManager::install("ReactomePA", force = TRUE)
library(ReactomePA)
#BiocManager::install("enrichplot", force = TRUE)
library(enrichplot)
#BiocManager::install("airway", force = TRUE)
#install.packages("airway")
library(airway)
library(ggplot2)
library(UpSetR)
library(tidyverse)

Load the data

#load the data
# Retrieve and prepare counts data file (number of read counts for each expressed gene) 
# Retrieve and prepare metadata (experimental design) file 
#Both files from airway package (part of bioconductor)

data(airway)
#check the package: contains info on the experiment
airway

## class: RangedSummarizedExperiment 
## dim: 63677 8 
## metadata(1): ''
## assays(1): counts
## rownames(63677): ENSG00000000003 ENSG00000000005 ... ENSG00000273492
##   ENSG00000273493
## rowData names(10): gene_id gene_name ... seq_coord_system symbol
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample

sample_info <- as.data.frame(colData(airway))
sample_info <- sample_info[,c(2,3)]
sample_info$dex <- gsub('trt', 'treated', sample_info$dex)
sample_info$dex <- gsub('untrt', 'untreated', sample_info$dex)
names(sample_info) <- c('cellLine', 'dexamethasone')
write.table(sample_info, file = "sample_info.csv", sep = ',', col.names = T, 
            row.names = T, quote = F)
countsData <- assay(airway)
write.table(countsData, file = "counts_data.csv", sep = ',', col.names = T, 
            row.names = T, quote = F)

Comment results here: two files are are counts_data.csv and the meta data us sample_info.csv .

Next: Perform DGE Differential Gene expression analysis using DESeq2 package

Section 1: load the files: counts matrix and meta data

# load the matrix that contains counts data
# check counts data
counts_data <- read.csv('counts_data.csv')
head(counts_data) # shows the first 6 rows

##                 SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003        679        448        873        408       1138
## ENSG00000000005          0          0          0          0          0
## ENSG00000000419        467        515        621        365        587
## ENSG00000000457        260        211        263        164        245
## ENSG00000000460         60         55         40         35         78
## ENSG00000000938          0          0          2          0          1
##                 SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003       1047        770        572
## ENSG00000000005          0          0          0
## ENSG00000000419        799        417        508
## ENSG00000000457        331        233        229
## ENSG00000000460         63         76         60
## ENSG00000000938          0          0          0

dim(counts_data)  # shows the dimensions of the matrix

## [1] 63677     8

# load and read in the Meta Data (experimental design) = sample info
# check the file
colData <- read.csv('sample_info.csv')
colData  # shows the first 6 rows

##            cellLine dexamethasone
## SRR1039508   N61311     untreated
## SRR1039509   N61311       treated
## SRR1039512  N052611     untreated
## SRR1039513  N052611       treated
## SRR1039516  N080611     untreated
## SRR1039517  N080611       treated
## SRR1039520  N061011     untreated
## SRR1039521  N061011       treated

dim(colData) # shows the dimensions of the table

## [1] 8 2

# Make sure the row names in colData matches to column names in counts_data
all(colnames(counts_data) %in% rownames(colData))

## [1] TRUE

# Make sure everything is in the same order
all(colnames(counts_data) == rownames(colData))

## [1] TRUE

# Histogram to check the distribution of the count number in counts_data
# Convert all numeric columns into one vector to histogram

all_counts <- unlist(counts_data[ , sapply(counts_data, is.numeric)])

# Plot the histogram
hist(all_counts,
     breaks = 1000,  # adjust bins if needed
     col = "steelblue",
     main = "Histogram of All Counts",
     xlab = "Counts",
     xlim = c(0,4000), # adjust for outliers
     ylab = "Frequency")

Comment results here: The histogram shows good results and it shows most of the dana sequences are in between 500 counts. .

Section 2: apply DESeq2. Construct a DESeqDataSet dds object

dds <- DESeqDataSetFromMatrix(countData = counts_data,
                              colData = colData,
                              design = ~ dexamethasone)

#check dds, dim = dimension, same as counts data.
dds

## class: DESeqDataSet 
## dim: 63677 8 
## metadata(1): version
## assays(1): counts
## rownames(63677): ENSG00000000003 ENSG00000000005 ... ENSG00000273492
##   ENSG00000273493
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(2): cellLine dexamethasone

# Optional: Pre-filtering: removing rows with low gene counts
# keeping only rows (genes) that show at least 10 reads/counts total

keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]

# Check dds, should have a lower dim now
dds

## class: DESeqDataSet 
## dim: 22369 8 
## metadata(1): version
## assays(1): counts
## rownames(22369): ENSG00000000003 ENSG00000000419 ... ENSG00000273487
##   ENSG00000273488
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(2): cellLine dexamethasone

# Set the factor level for comparison
dds$dexamethasone <- relevel(dds$dexamethasone, ref = "untreated")


# Variance-stabilizing transformation (like log-scale normalization)
# This lets you check if one sample has very different expression distribution
# meaning → potential outlier.

vsd <- vst(dds, blind = FALSE)

# Boxplot of normalized counts per sample
boxplot(assay(vsd),
        col = "lightblue",
        las = 2,
        main = "Boxplot of Normalized Counts (VST)",
        ylab = "Normalized expression")

Comment results here:The DESeqDataSet was created successfully after removing genes with very low counts. The boxplot of normalized counts showed consistent distributions across all samples after the variance-stabilizing transformation, suggesting that normalization worked well and there are no strong outliers between samples. .

Section 3: Run DESeq

dds <- DESeq(dds)

# Extract the results table
res <- results(dds)

# Check the results table log2-fold change, pvalue, adj-pvalue, for each gene
head(res)

## log2 fold change (MLE): dexamethasone treated vs untreated 
## Wald test p-value: dexamethasone treated vs untreated 
## DataFrame with 6 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat    pvalue
##                 <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003  708.5979     -0.3788229  0.173155 -2.187769 0.0286865
## ENSG00000000419  520.2963      0.2037893  0.100742  2.022878 0.0430857
## ENSG00000000457  237.1621      0.0340631  0.126476  0.269325 0.7876795
## ENSG00000000460   57.9324     -0.1171564  0.301583 -0.388472 0.6976669
## ENSG00000000971 5817.3108      0.4409793  0.258776  1.704099 0.0883626
## ENSG00000001036 1282.1007     -0.2419097  0.119713 -2.020751 0.0433055
##                      padj
##                 <numeric>
## ENSG00000000003  0.138470
## ENSG00000000419  0.182998
## ENSG00000000457  0.929805
## ENSG00000000460  0.894231
## ENSG00000000971  0.297042
## ENSG00000001036  0.183498

# Explore the results table
summary(res)

## 
## out of 22369 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 1884, 8.4%
## LFC < 0 (down)     : 1502, 6.7%
## outliers [1]       : 51, 0.23%
## low counts [2]     : 3903, 17%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

#Explore the results table with adj-pvalue < 0.05
res0.05 <- results(dds, alpha = 0.05)
summary(res0.05)

## 
## out of 22369 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 1555, 7%
## LFC < 0 (down)     : 1158, 5.2%
## outliers [1]       : 51, 0.23%
## low counts [2]     : 4336, 19%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

class(res)

## [1] "DESeqResults"
## attr(,"package")
## [1] "DESeq2"

head(res0.05)

## log2 fold change (MLE): dexamethasone treated vs untreated 
## Wald test p-value: dexamethasone treated vs untreated 
## DataFrame with 6 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat    pvalue
##                 <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003  708.5979     -0.3788229  0.173155 -2.187769 0.0286865
## ENSG00000000419  520.2963      0.2037893  0.100742  2.022878 0.0430857
## ENSG00000000457  237.1621      0.0340631  0.126476  0.269325 0.7876795
## ENSG00000000460   57.9324     -0.1171564  0.301583 -0.388472 0.6976669
## ENSG00000000971 5817.3108      0.4409793  0.258776  1.704099 0.0883626
## ENSG00000001036 1282.1007     -0.2419097  0.119713 -2.020751 0.0433055
##                      padj
##                 <numeric>
## ENSG00000000003  0.135712
## ENSG00000000419  0.179481
## ENSG00000000457  0.927740
## ENSG00000000460  0.890936
## ENSG00000000971  0.292084
## ENSG00000001036  0.179969

Comment results here:The DESeq analysis completed successfully, producing log2 fold changes and adjusted p-values for each gene. The summary indicated a significant number of genes were differentially expressed at an adjusted p-value cutoff of 0.05. This confirms the treatment with dexamethasone had a measurable impact on gene expression. .

Step 4 MA Plot

# Create MA plot to check expression mean for each gene, versus log2-fold change
DESeq2::plotMA(res, ylim=c(-5,5))

# Create interactive MA plot
# Convert DESeq2 results to a data frame
resDF <- as.data.frame(res)

# Add gene names/IDs if available (rownames are gene IDs)
resDF$gene <- rownames(resDF)

# Create ggplot MA plot
pMA <- ggplot(resDF, aes(x=baseMean, y=log2FoldChange,
                       text = paste("ENSG: ", gene,
                                    "<br>baseMean:", round(baseMean, 2),
                                    "<br>log2FC:", round(log2FoldChange, 2),
                                    "<br>padj:", signif(padj, 3)))) +
  geom_point(aes(color = padj < 0.05), alpha=0.6) +
  scale_x_log10() +
  scale_color_manual(values = c("grey", "red")) +
  labs(title="MA Plot", x="Mean Expression (baseMean)", y="Log2 Fold Change") +
  theme_minimal()

# Convert ggplot to interactive plotly plot
# ggplotly(pMA, tooltip="text")

Comment results here: The MA plot showed most genes centered around zero log2 fold change, while several genes with significant differential expression appeared as red points above or below the midline. This pattern is expected, showing that while most genes remain stable, a subset responds strongly to dexamethasone treatment. .

Section 5a: Selecting genes for Gene expression

# Gene expression: create Data frames for Top 10 Up and Top 10 Down Genes

# Order genes by adjusted p-value
resOrdered <- res[order(res$padj), ]
head(resOrdered, 20)

## log2 fold change (MLE): dexamethasone treated vs untreated 
## Wald test p-value: dexamethasone treated vs untreated 
## DataFrame with 20 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat       pvalue
##                 <numeric>      <numeric> <numeric> <numeric>    <numeric>
## ENSG00000152583   997.445        4.60255 0.2117419   21.7366 9.24586e-105
## ENSG00000148175 11193.723        1.45149 0.0848958   17.0974  1.55286e-65
## ENSG00000179094   776.600        3.18388 0.2015094   15.8002  3.10285e-56
## ENSG00000134686  2737.982        1.38717 0.0916587   15.1341  9.65533e-52
## ENSG00000125148  3656.267        2.20347 0.1474191   14.9470  1.63015e-50
## ...                   ...            ...       ...       ...          ...
## ENSG00000166741   7432.96        2.16222 0.1775146   12.1805  3.94760e-34
## ENSG00000183044    498.69        1.15807 0.0971415   11.9215  9.14677e-33
## ENSG00000144369   1303.26       -1.35029 0.1161490  -11.6255  3.05683e-31
## ENSG00000198624   2057.20        2.89495 0.2506388   11.5503  7.35848e-31
## ENSG00000164125   5596.35        1.12456 0.0981153   11.4616  2.05624e-30
##                         padj
##                    <numeric>
## ENSG00000152583 1.70262e-100
## ENSG00000148175  1.42980e-61
## ENSG00000179094  1.90463e-52
## ENSG00000134686  4.44507e-48
## ENSG00000125148  6.00384e-47
## ...                      ...
## ENSG00000166741  4.54344e-31
## ENSG00000183044  9.90811e-30
## ENSG00000144369  3.12731e-28
## ENSG00000198624  7.13192e-28
## ENSG00000164125  1.89328e-27

# Select Up-regulated genes with log2FC > 0
upGenes <- subset(resOrdered, log2FoldChange > 0)
#top 10 up
topUp <- head(upGenes, 10)


# Select Down-regulated genes with log2FC < 0
downGenes <- subset(resOrdered, log2FoldChange < 0)
#top 10 down
topDown <- head(downGenes, 10)


# Save as dataframes
topUp_df <- as.data.frame(topUp)
topDown_df <- as.data.frame(topDown)

# View
topUp_df

##                   baseMean log2FoldChange      lfcSE     stat        pvalue
## ENSG00000152583   997.4447       4.602555 0.21174194 21.73662 9.245855e-105
## ENSG00000148175 11193.7232       1.451494 0.08489578 17.09736  1.552860e-65
## ENSG00000179094   776.6002       3.183883 0.20150941 15.80017  3.102848e-56
## ENSG00000134686  2737.9816       1.387168 0.09165870 15.13406  9.655331e-52
## ENSG00000125148  3656.2674       2.203468 0.14741913 14.94696  1.630150e-50
## ENSG00000120129  3409.0384       2.949013 0.20154644 14.63193  1.757388e-48
## ENSG00000189221  2341.7807       3.305227 0.22803072 14.49466  1.309529e-47
## ENSG00000109906   385.0713       7.176149 0.49635930 14.45757  2.245873e-47
## ENSG00000101347 12703.4128       3.857750 0.27932477 13.81099  2.188188e-43
## ENSG00000162614  5393.1145       2.029959 0.15170652 13.38083  7.826993e-41
##                          padj
## ENSG00000152583 1.702624e-100
## ENSG00000148175  1.429795e-61
## ENSG00000179094  1.904631e-52
## ENSG00000134686  4.445073e-48
## ENSG00000125148  6.003843e-47
## ENSG00000120129  5.393715e-45
## ENSG00000189221  3.444997e-44
## ENSG00000109906  5.169719e-44
## ENSG00000101347  4.029548e-40
## ENSG00000162614  1.310310e-37

topDown_df

##                    baseMean log2FoldChange      lfcSE      stat       pvalue
## ENSG00000178695   2649.8225      -2.499883 0.17890348 -13.97336 2.266696e-44
## ENSG00000196517    229.6380      -2.216939 0.16682841 -13.28874 2.690743e-40
## ENSG00000116584   2250.5809      -1.014492 0.08242501 -12.30807 8.196465e-35
## ENSG00000144369   1303.2632      -1.350294 0.11614901 -11.62553 3.056829e-31
## ENSG00000108821 100989.9160      -1.299471 0.11531121 -11.26925 1.861461e-29
## ENSG00000162692    508.1648      -3.609332 0.32642874 -11.05703 2.027005e-28
## ENSG00000181467    630.8696      -1.272452 0.11841431 -10.74576 6.204626e-27
## ENSG00000145777    152.3708      -2.786561 0.26171426 -10.64734 1.794184e-26
## ENSG00000163491    160.2233      -2.472566 0.23688626 -10.43778 1.666663e-25
## ENSG00000183160   2987.6473      -1.169122 0.11218874 -10.42103 1.987967e-25
##                         padj
## ENSG00000178695 4.637912e-41
## ENSG00000196517 4.129169e-37
## ENSG00000116584 1.006253e-31
## ENSG00000144369 3.127306e-28
## ENSG00000108821 1.632324e-26
## ENSG00000162692 1.555304e-25
## ENSG00000181467 3.939938e-24
## ENSG00000145777 1.032497e-23
## ENSG00000163491 8.769029e-23
## ENSG00000183160 1.016901e-22

Comment results here: The top 10 upregulated and top 10 downregulated genes were successfully identified based on adjusted p-values. The upregulated genes had positive log2 fold changes, indicating higher expression in treated samples, while the downregulated genes had negative fold changes, reflecting reduced expression after treatment. .

Section 5b: Boxplots of Top Genes (ordered, custom facet labels)

# Extract normalized expression values from VST object
vsd_mat <- assay(vsd)

# Function to build tidy dataframe for a set of genes
make_gene_df <- function(gene_list, label) {
  df <- as.data.frame(t(vsd_mat[rownames(vsd_mat) %in% gene_list, ]))
  df$sample <- rownames(df)
  df <- pivot_longer(df, cols = -sample, names_to = "gene", values_to = "expression")
  df <- merge(df, colData, by.x = "sample", by.y = "row.names")
  df$direction <- label
  # shorten gene labels to last 5 characters
  df$gene_short <- substr(df$gene, nchar(df$gene)-4, nchar(df$gene))
  # custom facet label: gene short ID + newline + direction
  df$facet_label <- paste0(df$gene_short, "\n", label)
  return(df)
}

# Collect top 10 upregulated and downregulated gene IDs
topUp_genes <- rownames(topUp_df)
topDown_genes <- rownames(topDown_df)

# Create tidy dataframes with custom labels
df_up <- make_gene_df(topUp_genes, "Upregulated")
df_down <- make_gene_df(topDown_genes, "Downregulated")

# Bind and enforce order: Upregulated first, then Downregulated
df_all <- rbind(df_up, df_down)
df_all$facet_label <- factor(df_all$facet_label, 
                             levels = c(unique(df_up$facet_label), unique(df_down$facet_label)))

# Plot: 4 per row
ggplot(df_all, aes(x = dexamethasone, y = expression, fill = dexamethasone)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.5) +
  facet_wrap(~ facet_label, scales = "free_y", ncol = 4) +
  scale_fill_manual(values = c("untreated" = "skyblue", "treated" = "salmon")) +
  labs(title = "Expression of Top 10 Up- and Down-Regulated Genes",
       x = "Condition", y = "Normalized Expression (VST)") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1),
        strip.text = element_text(face = "bold"))

Section 6 Create Volcano Plot to check adj-pvalue versus log2 fold change

resDF <- as.data.frame(res)    #already done for MA plot
resDF$gene <- rownames(res)     #already done for MA plot


volcanop <- ggplot(resDF, aes(x=log2FoldChange, y=-log10(pvalue),
                       text = paste("ENSG: ", gene,
                                    "<br>log2FC:", round(log2FoldChange, 2),
                                    "<br>p-adj:", signif(padj, 3)))) +
  geom_point(aes(color = padj < 0.05), alpha=0.6) +
  scale_color_manual(values = c("grey", "red")) +
  theme_minimal() +
  xlim(c(-6,6)) +
  labs(title="Volcano Plot", x="Log2 Fold Change", y="-log10(p-value)")

volcanop

# Convert to interactive plot
#ggplotly(volcanop, tooltip = "text")

Comment rsults here: The volcano plot revealed a large number of significantly differentially expressed genes, with clear separation between upregulated and downregulated groups. Genes with both high log2 fold change and low p-values stood out in red, highlighting the most biologically relevant changes due to dexamethasone. .

Step 7: PCA Plot (ggplot version)

vsdata <- vst(dds, blind=FALSE)
pcaData <- plotPCA(vsdata, intgroup="dexamethasone", returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

ggplot(pcaData, aes(PC1, PC2, color=dexamethasone)) +
  geom_point(size=4, alpha=0.8) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  theme_minimal() +
  ggtitle("PCA of Samples")

# Create Interactive PCA
p <- ggplot(pcaData, aes(x = PC1, y = PC2, color = dexamethasone,
                         text = paste("Sample:", name,
                                      "<br>Condition:", dexamethasone))) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  theme_minimal()

#interactive
# ggplotly(p, tooltip = "text")

Comment results here: The PCA plot displayed a clear separation between treated and untreated samples along the first principal component. This indicates that dexamethasone treatment is the main source of variation in the dataset, confirming strong transcriptional effects and good experimental reproducibility. .

Section 8a: Heatmap of Top 10 Up and top 10 Down

# Add a column to mark direction

topUp_df$direction <- "Upregulated"
topDown_df$direction <- "Downregulated"

# Combine into one dataframe
topGenes_df <- rbind(topUp_df, topDown_df)

# Get top 20 genes by padj
topGenes_df <- head(rownames(topGenes_df), 20)

pheatmap(assay(vsdata)[topGenes_df, ],
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         annotation_col = colData,
         scale = "row",
         fontsize_row = 8,
         main = "Heatmap of Top 10 Up/Down DEGs")

Comment results here: Both heatmaps showed distinct clustering of treated versus untreated samples. The top differentially expressed genes had clear expression trends, with upregulated genes showing higher values in treated samples. This supports the findings from the DESeq2 analysis and visualizes consistent biological separation. .

Section 8b: Heatmap of Top 20 DEGs

# Get top 20 genes by padj
topGenes <- head(rownames(resOrdered), 20)

pheatmap(assay(vsdata)[topGenes, ],
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         annotation_col = colData,
         scale = "row",
         fontsize_row = 8,
         main = "Heatmap of Top 20 DEGs (adj-pv)")

Comment results here: The Ensembl gene IDs were successfully mapped to gene symbols and UniProt IDs using the biomaRt package. This step adds biological meaning to the results, allowing easier interpretation of which specific genes are affected by treatment. .

Section 9: Map ENSG IDs to Gene Names/UniPro, Gene Symbols

# Use biomaRt for ID conversion

mart <- useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")

# Convert IDs for top 10 up & top 10 down
geneIDs <- c(rownames(topUp_df), rownames(topDown_df))

annot <- getBM(attributes=c("ensembl_gene_id", 
                            "hgnc_symbol", 
                            "uniprot_gn_id", 
                            "refseq_mrna"),
               filters="ensembl_gene_id",
               values=geneIDs,
               mart=mart)

# Merge annotation with results
topUp_annot <- merge(topUp_df, annot, by.x="row.names", by.y="ensembl_gene_id")
topDown_annot <- merge(topDown_df, annot, by.x="row.names", by.y="ensembl_gene_id")

# Final annotated dataframes
#topUp_annot
#topDown_annot

# Merge annotation into res dataframe
resDF <- as.data.frame(res)
resDF$ensembl_gene_id <- rownames(resDF)

resDF_annot <- merge(resDF, annot, by="ensembl_gene_id", all.x=TRUE)

# Add a label column for top genes
highlightGenes <- c(rownames(topUp_df), rownames(topDown_df))
resDF_annot$label <- ifelse(resDF_annot$ensembl_gene_id %in% highlightGenes,
                            resDF_annot$hgnc_symbol, NA)

# ggplot volcano
vPlot <- ggplot(resDF_annot,
                      aes(x=log2FoldChange, y=-log10(pvalue),
                          text=paste0("ENSG: ", ensembl_gene_id,
                                      "<br>Gene: ", hgnc_symbol,
                                      "<br>UniProt: ", uniprot_gn_id,
                                      "<br>Log2FC: ", round(log2FoldChange,2),
                                      "<br>padj: ", signif(padj,3)))) +
  geom_jitter(aes(color = padj < 0.05), alpha=0.6) +
  scale_color_manual(values=c("grey","red")) +
  theme_minimal() +
  xlim(c(-6,6)) +
  labs(title="Interactive Volcano Plot",
       x="Log2 Fold Change", y="-log10(p-value)") +
  geom_text(aes(label=label), vjust=-1, size=3, na.rm=TRUE)

vPlot

# Convert to interactive with Plotly
# ggplotly(vPlot, tooltip="text")

Comment results here: The GO enrichment results identified several biological processes, molecular functions, and cellular components significantly enriched among differentially expressed genes. Many were related to stress response, inflammation, and transcription regulation, which align with the known effects of dexamethasone on airway cells. .

Section 10: Gene Ontology GO enrichment analysis

# Get significant DEGs (padj < 0.05)

sigGenes <- rownames(res)[which(res$padj < 0.05 & !is.na(res$padj))]

# Remove version numbers
# Convert Ensembl IDs to Entrez IDs
sigGenes <- gsub("\\..*", "", 
                 rownames(res)[which(res$padj < 0.05 & !is.na(res$padj))])

entrez <- bitr(sigGenes,
               fromType = "ENSEMBL",
               toType = "ENTREZID",
               OrgDb = org.Hs.eg.db)

head(entrez)

##           ENSEMBL ENTREZID
## 1 ENSG00000002834     3927
## 2 ENSG00000003096    90293
## 3 ENSG00000003402     8837
## 4 ENSG00000003987     9108
## 5 ENSG00000004059      381
## 6 ENSG00000004487    23028

Comment results here: The KEGG pathway analysis revealed enrichment in pathways associated with glucocorticoid response, cytokine signaling, and immune modulation. These results provide strong biological support for the treatment effects observed in the differential expression analysis. .

Section 10a: GO enrichment, BP biological process

#Take the significant DE genes and see what 
#biological processes / pathways they’re enriched in.

ego <- enrichGO(gene=entrez$ENTREZID,
                OrgDb=org.Hs.eg.db,
                keyType="ENTREZID",
                ont="BP",   # Biological Process
                pAdjustMethod="BH",
                pvalueCutoff=0.05,
                readable=TRUE)

# Dotplot
dotplot(ego, showCategory=20) + ggtitle("GO Biological Process Enrichment")

Comment results here: The GSEA analysis showed enrichment of gene sets related to inflammatory and stress-response processes, confirming that dexamethasone regulates multiple coordinated pathways rather than isolated genes. The ridgeplot highlighted several upregulated biological pathways consistent with the drug’s mechanism of action. .

Section 10b: GO enrichment,Molecular Function (MF)

ego_mf <- enrichGO(gene          = entrez$ENTREZID,
                   OrgDb         = org.Hs.eg.db,
                   keyType       = "ENTREZID",
                   ont           = "MF",
                   pAdjustMethod = "BH",
                   pvalueCutoff  = 0.05,
                   readable      = TRUE)

dotplot(ego_mf, showCategory=20) + ggtitle("GO: Molecular Function")

Comment results here: The Reactome enrichment highlighted pathways involving immune system regulation, cytokine signaling, and transcriptional control. The dotplot and barplot visualization clearly show that these pathways are significantly enriched in treated samples, emphasizing the systemic transcriptional effects of dexamethasone. .

Section 10c: GO enrichment,Cellular Component (CC)

ego_cc <- enrichGO(gene          = entrez$ENTREZID,
                   OrgDb         = org.Hs.eg.db,
                   keyType       = "ENTREZID",
                   ont           = "CC",
                   pAdjustMethod = "BH",
                   pvalueCutoff  = 0.05,
                   readable      = TRUE)

dotplot(ego_cc, showCategory=20) + ggtitle("GO: Cellular Component")

Comment results here: The UpSet plot confirmed that there is no overlap between the top upregulated and downregulated gene sets, which is expected since these represent opposite regulatory responses. It helps confirm that the filtering and classification of genes worked properly. .

Section 11: KEGG Pathway Enrichment

#Check whether DE genes cluster in known metabolic or signaling pathways.

ekegg <- enrichKEGG(gene = entrez$ENTREZID,
                    organism = 'hsa',
                    pvalueCutoff = 0.05)

dotplot(ekegg, showCategory=15) + ggtitle("KEGG Pathway Enrichment")

Comment results here: The CSV file containing the top 10 upregulated and downregulated genes was generated successfully for further use in g:Profiler. These genes can now be analyzed to explore specific biological pathways and networks affected by dexamethasone treatment. .

Section 12: Gene Set Enrichment Analysis (GSEA)

#Instead of just significant genes, 
#rank all genes by log2FC and see if gene sets are enriched.

# Rank genes by log2FC
geneList <- res$log2FoldChange
names(geneList) <- gsub("\\..*", "", rownames(res))
geneList <- sort(geneList, decreasing=TRUE)

# Run GSEA
gseaRes <- gseGO(geneList=geneList,
                 OrgDb=org.Hs.eg.db,
                 ont="BP",
                 keyType="ENSEMBL",
                 pvalueCutoff=0.05)

ridgeplot(gseaRes) + ggtitle("GSEA of Biological Processes")

Comment results here: The top 50 genes by adjusted p-value were exported successfully. These genes represent the most statistically significant transcriptional responses, providing a focused set for external pathway and enrichment analysis using g:Profiler. .

Section 13: Reactome Pathway Enrichment Analysis

# Perform Reactome pathway enrichment
ereactome <- enrichPathway(gene          = entrez$ENTREZID,
                           organism      = "human",
                           pvalueCutoff  = 0.05,
                           readable      = TRUE)

# View top enriched Reactome pathways
#head(as.data.frame(ereactome))

# Dotplot of top enriched Reactome pathways
dotplot(ereactome, showCategory = 20) + 
  ggtitle("Reactome Pathway Enrichment")

# Barplot alternative
barplot(ereactome, showCategory = 20, title = "Top Reactome Pathways")

# Network visualization (if desired)
cnetplot(ereactome, showCategory = 10, foldChange = NULL) +
  ggtitle("Reactome Pathway - Gene Network")

# Save results to CSV for downstream analysis
write.csv(as.data.frame(ereactome), "Reactome_enrichment_results.csv", row.names = FALSE)

Comment results here: The Reactome pathway enrichment analysis revealed that several key biological pathways were significantly enriched among the differentially expressed genes. Many of these pathways were related to immune system regulation, cytokine signaling, and metabolic processes. The dotplot and barplot clearly showed which pathways were most affected, while the network visualization highlighted the strong relationships between genes and pathways. Overall, these findings confirm that dexamethasone influences multiple cellular signaling and immune response mechanisms. .

Section 14: upset Plot (alternative to Venn),

#If you have multiple contrasts, you can show overlap of significant DEGs.

# Example if you had multiple sets of DEGs: lik top_up, top_down, 
#show that there s no intersection etween them
listInput <- list(
  Up = rownames(topUp_df),
  Down = rownames(topDown_df)
)

upset(fromList(listInput), order.by = "freq")

Comment results here: The UpSet plot demonstrated that there was no overlap between the upregulated and downregulated gene groups, which is expected because these represent opposite directions of expression change. This confirms that the classification of genes into upregulated and downregulated sets was accurate. It also shows that the differential expression filtering worked properly, with each group containing distinct sets of genes responding to dexamethasone treatment. .

Section 15 Optional: Take the top 10 up and top 10 down to g:profller (G).

#First extract the Gene symols and the save as a csv

# Connect to Ensembl
mart <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

# Function to map Ensembl IDs to gene symbols
ensembl_to_symbol <- function(ensembl_ids) {
  mapping <- getBM(
    attributes = c("ensembl_gene_id", "hgnc_symbol"),
    filters = "ensembl_gene_id",
    values = ensembl_ids,
    mart = mart
  )
  return(mapping)
}

#Add add column name "Ensembl_ID" to the first 
#column of topUp_df and first column of topDown_df


# Move rownames into a real column called Ensembl_ID
topUp_df$Ensembl_ID <- rownames(topUp_df)
rownames(topUp_df) <- NULL

topDown_df$Ensembl_ID <- rownames(topDown_df)
rownames(topDown_df) <- NULL

#Now both will have Ensembl_ID as the first column (you can reorder if you want):
topUp_df <- topUp_df[, c("Ensembl_ID", setdiff(names(topUp_df), "Ensembl_ID"))]
topDown_df <- topDown_df[, c("Ensembl_ID", setdiff(names(topDown_df), "Ensembl_ID"))]


# Extract Ensembl IDs from your dataframes
up_ids <- topUp_df$Ensembl_ID
down_ids <- topDown_df$Ensembl_ID

# Map IDs
up_map <- ensembl_to_symbol(up_ids)
down_map <- ensembl_to_symbol(down_ids)

# Add regulation status
up_map$Status <- "Upregulated"
down_map$Status <- "Downregulated"

# Keep only 2 columns (GeneSymbol, Status)
up_map <- up_map[, c("hgnc_symbol", "Status")]
down_map <- down_map[, c("hgnc_symbol", "Status")]

# Combine
combined_df <- rbind(up_map, down_map)

# Rename columns
colnames(combined_df) <- c("GeneSymbol", "Status")

# Save to CSV
write.csv(combined_df, "combined_genes.csv", row.names = FALSE)

Comment results here: The top 10 upregulated and downregulated genes were successfully mapped to their corresponding gene symbols and saved as a combined CSV file for external analysis in g:Profiler. This step helps connect the statistical results to biological meaning, allowing exploration of which pathways or biological functions these top genes participate in. The resulting dataset provides a clear and concise summary of the most significant transcriptional changes caused by dexamethasone. .

Section 16 Optional: Take the top 50 adjusted for pvalue, to g:profiler (G).

#First extract the Gene symbols and the save as a csv

# Order genes by adjusted p-value
top_50 <- res[order(res$padj), ]
top_50 <- top_50[1:50, ]


##
top_50$Ensembl_ID <- rownames(top_50)
rownames(top_50) <- NULL

# top50 will have Ensembl_ID as the first column (you can reorder if you want):
top_50 <- top_50[, c("Ensembl_ID", setdiff(names(top_50), "Ensembl_ID"))]



# Map Ensembl IDs to gene symbols
top_50$Gene_Symbol <- mapIds(
  org.Hs.eg.db,
  keys = top_50$Ensembl_ID,
  column = "SYMBOL",
  keytype = "ENSEMBL",
  multiVals = "first"
)

top_50$Status <- ifelse(top_50$log2FoldChange > 0, "Upregulated", "Downregulated")

# Keep only Ensembl_ID, Gene_Symbol, and adjusted p-value
out <- top_50[, c("Ensembl_ID", "Gene_Symbol", "log2FoldChange", "padj", "Status")]

# Write to CSV
write.csv(out, "top_50_genes.csv", row.names = FALSE)

Comment here: The top 50 genes with the lowest adjusted p-values were identified, annotated with gene symbols, and saved for further enrichment analysis. This focused list represents the most statistically significant and biologically relevant transcriptional changes. It serves as a valuable input for g:Profiler or similar databases to identify enriched pathways and gene networks affected by the treatment. The results provide a strong foundation for deeper functional interpretation of the dataset.

if (!requireNamespace(“BiocManager”, quietly = TRUE)) install.packages(“BiocManager”)

BiocManager::install(“airway”, force = TRUE)

Lab 6 Transcriptomics with DESeq

Abdullah Sajid Chattha

2025-010-09