We will use DESeq2 package in R to run Differential Gene Expression. The analysys will use smooth muscle cells to check the transcriptome of normal cells and cells treated with asthma medication (GEO: GSE52778). This R version is modified from the following Bioconductor vignette: https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#standard-workflow

Load Libraries

#install.packages("htmltools")
library(htmltools)
#BiocManager::install("rnaseqGene")
library(rnaseqGene)
#install.packages("plotly")
library(plotly)
#BiocManager::install("clusterProfiler", force = TRUE)
library(clusterProfiler)
#BiocManager::install("org.Hs.eg.db", force = TRUE)
library(org.Hs.eg.db)
#BiocManager::install("DOSE", force = TRUE)
library(DOSE)
#BiocManager::install("biomaRt", force = TRUE)
library(biomaRt)
#BiocManager::install("DESeq2", force = TRUE)
library(DESeq2)
#install.packages("pheatmap")
library(pheatmap)
#BiocManager::install("ReactomePA", force = TRUE)
library(ReactomePA)
#BiocManager::install("enrichplot", force = TRUE)
library(enrichplot)
#BiocManager::install("airway", force = TRUE)
#install.packages("airway")
library(airway)
library(ggplot2)
library(UpSetR)
library(tidyverse)
install.packages("ggridges")

Load the data

#load the data
# Retrieve and prepare counts data file (number of read counts for each expressed gene) 
# Retrieve and prepare metadata (experimental design) file 
#Both files from airway package (part of bioconductor)

data(airway)
#check the package: contains info on the experiment
airway
## class: RangedSummarizedExperiment 
## dim: 63677 8 
## metadata(1): ''
## assays(1): counts
## rownames(63677): ENSG00000000003 ENSG00000000005 ... ENSG00000273492
##   ENSG00000273493
## rowData names(10): gene_id gene_name ... seq_coord_system symbol
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
sample_info <- as.data.frame(colData(airway))
sample_info <- sample_info[,c(2,3)]
sample_info$dex <- gsub('trt', 'treated', sample_info$dex)
sample_info$dex <- gsub('untrt', 'untreated', sample_info$dex)
names(sample_info) <- c('cellLine', 'dexamethasone')
write.table(sample_info, file = "sample_info.csv", sep = ',', col.names = T, 
            row.names = T, quote = F)
countsData <- assay(airway)
write.table(countsData, file = "counts_data.csv", sep = ',', col.names = T, 
            row.names = T, quote = F)

Comment results here: .

Next: Perform DGE Differential Gene expression analysis using DESeq2 package

Section 1: load the files: counts matrix and meta data

# load the matrix that contains counts data
# check counts data
counts_data <- read.csv('counts_data.csv')
head(counts_data) # shows the first 6 rows
##                 SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003        679        448        873        408       1138
## ENSG00000000005          0          0          0          0          0
## ENSG00000000419        467        515        621        365        587
## ENSG00000000457        260        211        263        164        245
## ENSG00000000460         60         55         40         35         78
## ENSG00000000938          0          0          2          0          1
##                 SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003       1047        770        572
## ENSG00000000005          0          0          0
## ENSG00000000419        799        417        508
## ENSG00000000457        331        233        229
## ENSG00000000460         63         76         60
## ENSG00000000938          0          0          0
dim(counts_data)  # shows the dimensions of the matrix
## [1] 63677     8
# load and read in the Meta Data (experimental design) = sample info
# check the file
colData <- read.csv('sample_info.csv')
colData  # shows the first 6 rows
##            cellLine dexamethasone
## SRR1039508   N61311     untreated
## SRR1039509   N61311       treated
## SRR1039512  N052611     untreated
## SRR1039513  N052611       treated
## SRR1039516  N080611     untreated
## SRR1039517  N080611       treated
## SRR1039520  N061011     untreated
## SRR1039521  N061011       treated
dim(colData) # shows the dimensions of the table
## [1] 8 2
# Make sure the row names in colData matches to column names in counts_data
all(colnames(counts_data) %in% rownames(colData))
## [1] TRUE
# Make sure everything is in the same order
all(colnames(counts_data) == rownames(colData))
## [1] TRUE
# Histogram to check the distribution of the count number in counts_data
# Convert all numeric columns into one vector to histogram

all_counts <- unlist(counts_data[ , sapply(counts_data, is.numeric)])

# Plot the histogram
hist(all_counts,
     breaks = 1000,  # adjust bins if needed
     col = "steelblue",
     main = "Histogram of All Counts",
     xlab = "Counts",
     xlim = c(0,4000), # adjust for outliers
     ylab = "Frequency")

Comment results here: Histogram - the most abundant count numbers is between 0 and 500 .

Section 2: apply DESeq2. Construct a DESeqDataSet dds object

dds <- DESeqDataSetFromMatrix(countData = counts_data,
                              colData = colData,
                              design = ~ dexamethasone)

#check dds, dim = dimension, same as counts data.
dds
## class: DESeqDataSet 
## dim: 63677 8 
## metadata(1): version
## assays(1): counts
## rownames(63677): ENSG00000000003 ENSG00000000005 ... ENSG00000273492
##   ENSG00000273493
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(2): cellLine dexamethasone
# Optional: Pre-filtering: removing rows with low gene counts
# keeping only rows (genes) that show at least 10 reads/counts total

keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]

# Check dds, should have a lower dim now
dds
## class: DESeqDataSet 
## dim: 22369 8 
## metadata(1): version
## assays(1): counts
## rownames(22369): ENSG00000000003 ENSG00000000419 ... ENSG00000273487
##   ENSG00000273488
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(2): cellLine dexamethasone
# Set the factor level for comparison
dds$dexamethasone <- relevel(dds$dexamethasone, ref = "untreated")


# Variance-stabilizing transformation (like log-scale normalization)
# This lets you check if one sample has very different expression distribution
# meaning → potential outlier.

vsd <- vst(dds, blind = FALSE)

# Boxplot of normalized counts per sample
boxplot(assay(vsd),
        col = "lightblue",
        las = 2,
        main = "Boxplot of Normalized Counts (VST)",
        ylab = "Normalized expression")

Comment results here: The normalized expression is equal for each of the samples which is what we want to see - we want to see that we can now compare biological variation .

Section 3: Run DESeq

dds <- DESeq(dds)

# Extract the results table
res <- results(dds)

# Check the results table log2-fold change, pvalue, adj-pvalue, for each gene
head(res)
## log2 fold change (MLE): dexamethasone treated vs untreated 
## Wald test p-value: dexamethasone treated vs untreated 
## DataFrame with 6 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat    pvalue
##                 <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003  708.5979     -0.3788229  0.173155 -2.187769 0.0286865
## ENSG00000000419  520.2963      0.2037893  0.100742  2.022878 0.0430857
## ENSG00000000457  237.1621      0.0340631  0.126476  0.269325 0.7876795
## ENSG00000000460   57.9324     -0.1171564  0.301583 -0.388472 0.6976669
## ENSG00000000971 5817.3108      0.4409793  0.258776  1.704099 0.0883626
## ENSG00000001036 1282.1007     -0.2419097  0.119713 -2.020751 0.0433055
##                      padj
##                 <numeric>
## ENSG00000000003  0.138470
## ENSG00000000419  0.182998
## ENSG00000000457  0.929805
## ENSG00000000460  0.894231
## ENSG00000000971  0.297042
## ENSG00000001036  0.183498
# Explore the results table
summary(res)
## 
## out of 22369 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 1884, 8.4%
## LFC < 0 (down)     : 1502, 6.7%
## outliers [1]       : 51, 0.23%
## low counts [2]     : 3903, 17%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
#Explore the results table with adj-pvalue < 0.05
res0.05 <- results(dds, alpha = 0.05)
summary(res0.05)
## 
## out of 22369 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 1555, 7%
## LFC < 0 (down)     : 1158, 5.2%
## outliers [1]       : 51, 0.23%
## low counts [2]     : 4336, 19%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
class(res)
## [1] "DESeqResults"
## attr(,"package")
## [1] "DESeq2"
head(res0.05)
## log2 fold change (MLE): dexamethasone treated vs untreated 
## Wald test p-value: dexamethasone treated vs untreated 
## DataFrame with 6 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat    pvalue
##                 <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003  708.5979     -0.3788229  0.173155 -2.187769 0.0286865
## ENSG00000000419  520.2963      0.2037893  0.100742  2.022878 0.0430857
## ENSG00000000457  237.1621      0.0340631  0.126476  0.269325 0.7876795
## ENSG00000000460   57.9324     -0.1171564  0.301583 -0.388472 0.6976669
## ENSG00000000971 5817.3108      0.4409793  0.258776  1.704099 0.0883626
## ENSG00000001036 1282.1007     -0.2419097  0.119713 -2.020751 0.0433055
##                      padj
##                 <numeric>
## ENSG00000000003  0.135712
## ENSG00000000419  0.179481
## ENSG00000000457  0.927740
## ENSG00000000460  0.890936
## ENSG00000000971  0.292084
## ENSG00000001036  0.179969

Comment results here: This shows the adjusted p values for the samples - we see that none of the samples are significant .

Step 4 MA Plot

# Create MA plot to check expression mean for each gene, versus log2-fold change
DESeq2::plotMA(res, ylim=c(-5,5))

# Create interactive MA plot
# Convert DESeq2 results to a data frame
resDF <- as.data.frame(res)

# Add gene names/IDs if available (rownames are gene IDs)
resDF$gene <- rownames(resDF)

# Create ggplot MA plot
pMA <- ggplot(resDF, aes(x=baseMean, y=log2FoldChange,
                       text = paste("ENSG: ", gene,
                                    "<br>baseMean:", round(baseMean, 2),
                                    "<br>log2FC:", round(log2FoldChange, 2),
                                    "<br>padj:", signif(padj, 3)))) +
  geom_point(aes(color = padj < 0.05), alpha=0.6) +
  scale_x_log10() +
  scale_color_manual(values = c("grey", "red")) +
  labs(title="MA Plot", x="Mean Expression (baseMean)", y="Log2 Fold Change") +
  theme_minimal()

# Convert ggplot to interactive plotly plot
# ggplotly(pMA, tooltip="text")

Comment results here: For the MA plot, we want to focus on the normalized counts that are further to the right of the graph - away from the low value of the mean normalized counts - anything about the gray area (0) is up regulated and anything below is down regulated .

Section 5a: Selecting genes for Gene expression

# Gene expression: create Data frames for Top 10 Up and Top 10 Down Genes

# Order genes by adjusted p-value
resOrdered <- res[order(res$padj), ]
head(resOrdered, 20)
## log2 fold change (MLE): dexamethasone treated vs untreated 
## Wald test p-value: dexamethasone treated vs untreated 
## DataFrame with 20 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat       pvalue
##                 <numeric>      <numeric> <numeric> <numeric>    <numeric>
## ENSG00000152583   997.445        4.60255 0.2117419   21.7366 9.24586e-105
## ENSG00000148175 11193.723        1.45149 0.0848958   17.0974  1.55286e-65
## ENSG00000179094   776.600        3.18388 0.2015094   15.8002  3.10285e-56
## ENSG00000134686  2737.982        1.38717 0.0916587   15.1341  9.65533e-52
## ENSG00000125148  3656.267        2.20347 0.1474191   14.9470  1.63015e-50
## ...                   ...            ...       ...       ...          ...
## ENSG00000166741   7432.96        2.16222 0.1775146   12.1805  3.94760e-34
## ENSG00000183044    498.69        1.15807 0.0971415   11.9215  9.14677e-33
## ENSG00000144369   1303.26       -1.35029 0.1161490  -11.6255  3.05683e-31
## ENSG00000198624   2057.20        2.89495 0.2506388   11.5503  7.35848e-31
## ENSG00000164125   5596.35        1.12456 0.0981153   11.4616  2.05624e-30
##                         padj
##                    <numeric>
## ENSG00000152583 1.70262e-100
## ENSG00000148175  1.42980e-61
## ENSG00000179094  1.90463e-52
## ENSG00000134686  4.44507e-48
## ENSG00000125148  6.00384e-47
## ...                      ...
## ENSG00000166741  4.54344e-31
## ENSG00000183044  9.90811e-30
## ENSG00000144369  3.12731e-28
## ENSG00000198624  7.13192e-28
## ENSG00000164125  1.89328e-27
# Select Up-regulated genes with log2FC > 0
upGenes <- subset(resOrdered, log2FoldChange > 0)
#top 10 up
topUp <- head(upGenes, 10)


# Select Down-regulated genes with log2FC < 0
downGenes <- subset(resOrdered, log2FoldChange < 0)
#top 10 down
topDown <- head(downGenes, 10)


# Save as dataframes
topUp_df <- as.data.frame(topUp)
topDown_df <- as.data.frame(topDown)

# View
topUp_df
##                   baseMean log2FoldChange      lfcSE     stat        pvalue
## ENSG00000152583   997.4447       4.602555 0.21174194 21.73662 9.245855e-105
## ENSG00000148175 11193.7232       1.451494 0.08489578 17.09736  1.552860e-65
## ENSG00000179094   776.6002       3.183883 0.20150941 15.80017  3.102848e-56
## ENSG00000134686  2737.9816       1.387168 0.09165870 15.13406  9.655331e-52
## ENSG00000125148  3656.2674       2.203468 0.14741913 14.94696  1.630150e-50
## ENSG00000120129  3409.0384       2.949013 0.20154644 14.63193  1.757388e-48
## ENSG00000189221  2341.7807       3.305227 0.22803072 14.49466  1.309529e-47
## ENSG00000109906   385.0713       7.176149 0.49635930 14.45757  2.245873e-47
## ENSG00000101347 12703.4128       3.857750 0.27932477 13.81099  2.188188e-43
## ENSG00000162614  5393.1145       2.029959 0.15170652 13.38083  7.826993e-41
##                          padj
## ENSG00000152583 1.702624e-100
## ENSG00000148175  1.429795e-61
## ENSG00000179094  1.904631e-52
## ENSG00000134686  4.445073e-48
## ENSG00000125148  6.003843e-47
## ENSG00000120129  5.393715e-45
## ENSG00000189221  3.444997e-44
## ENSG00000109906  5.169719e-44
## ENSG00000101347  4.029548e-40
## ENSG00000162614  1.310310e-37
topDown_df
##                    baseMean log2FoldChange      lfcSE      stat       pvalue
## ENSG00000178695   2649.8225      -2.499883 0.17890348 -13.97336 2.266696e-44
## ENSG00000196517    229.6380      -2.216939 0.16682841 -13.28874 2.690743e-40
## ENSG00000116584   2250.5809      -1.014492 0.08242501 -12.30807 8.196465e-35
## ENSG00000144369   1303.2632      -1.350294 0.11614901 -11.62553 3.056829e-31
## ENSG00000108821 100989.9160      -1.299471 0.11531121 -11.26925 1.861461e-29
## ENSG00000162692    508.1648      -3.609332 0.32642874 -11.05703 2.027005e-28
## ENSG00000181467    630.8696      -1.272452 0.11841431 -10.74576 6.204626e-27
## ENSG00000145777    152.3708      -2.786561 0.26171426 -10.64734 1.794184e-26
## ENSG00000163491    160.2233      -2.472566 0.23688626 -10.43778 1.666663e-25
## ENSG00000183160   2987.6473      -1.169122 0.11218874 -10.42103 1.987967e-25
##                         padj
## ENSG00000178695 4.637912e-41
## ENSG00000196517 4.129169e-37
## ENSG00000116584 1.006253e-31
## ENSG00000144369 3.127306e-28
## ENSG00000108821 1.632324e-26
## ENSG00000162692 1.555304e-25
## ENSG00000181467 3.939938e-24
## ENSG00000145777 1.032497e-23
## ENSG00000163491 8.769029e-23
## ENSG00000183160 1.016901e-22

Comment results here: This shows us the adjusted p value and we see that all 10 samples are significant. .

Section 5b: Boxplots of Top Genes (ordered, custom facet labels)

# Extract normalized expression values from VST object
vsd_mat <- assay(vsd)

# Function to build tidy dataframe for a set of genes
make_gene_df <- function(gene_list, label) {
  df <- as.data.frame(t(vsd_mat[rownames(vsd_mat) %in% gene_list, ]))
  df$sample <- rownames(df)
  df <- pivot_longer(df, cols = -sample, names_to = "gene", values_to = "expression")
  df <- merge(df, colData, by.x = "sample", by.y = "row.names")
  df$direction <- label
  # shorten gene labels to last 5 characters
  df$gene_short <- substr(df$gene, nchar(df$gene)-4, nchar(df$gene))
  # custom facet label: gene short ID + newline + direction
  df$facet_label <- paste0(df$gene_short, "\n", label)
  return(df)
}

# Collect top 10 upregulated and downregulated gene IDs
topUp_genes <- rownames(topUp_df)
topDown_genes <- rownames(topDown_df)

# Create tidy dataframes with custom labels
df_up <- make_gene_df(topUp_genes, "Upregulated")
df_down <- make_gene_df(topDown_genes, "Downregulated")

# Bind and enforce order: Upregulated first, then Downregulated
df_all <- rbind(df_up, df_down)
df_all$facet_label <- factor(df_all$facet_label, 
                             levels = c(unique(df_up$facet_label), unique(df_down$facet_label)))

# Plot: 4 per row
ggplot(df_all, aes(x = dexamethasone, y = expression, fill = dexamethasone)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.5) +
  facet_wrap(~ facet_label, scales = "free_y", ncol = 4) +
  scale_fill_manual(values = c("untreated" = "skyblue", "treated" = "salmon")) +
  labs(title = "Expression of Top 10 Up- and Down-Regulated Genes",
       x = "Condition", y = "Normalized Expression (VST)") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1),
        strip.text = element_text(face = "bold"))

Comment results here: These graphs show us what samples had up regulation and down regulation for the treated and untreated samples .

Section 6 Create Volcano Plot to check adj-pvalue versus log2 fold change

resDF <- as.data.frame(res)    #already done for MA plot
resDF$gene <- rownames(res)     #already done for MA plot


volcanop <- ggplot(resDF, aes(x=log2FoldChange, y=-log10(pvalue),
                       text = paste("ENSG: ", gene,
                                    "<br>log2FC:", round(log2FoldChange, 2),
                                    "<br>p-adj:", signif(padj, 3)))) +
  geom_point(aes(color = padj < 0.05), alpha=0.6) +
  scale_color_manual(values = c("grey", "red")) +
  theme_minimal() +
  xlim(c(-6,6)) +
  labs(title="Volcano Plot", x="Log2 Fold Change", y="-log10(p-value)")

volcanop

# Convert to interactive plot
#ggplotly(volcanop, tooltip = "text")

Comment results here: This is a volcano plot and the gray areas are not significant but the dots in red are significant. Anything to the right of 0 is up regulated and anything to the left of 0 is down regulated .

Step 7: PCA Plot (ggplot version)

vsdata <- vst(dds, blind=FALSE)
pcaData <- plotPCA(vsdata, intgroup="dexamethasone", returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

ggplot(pcaData, aes(PC1, PC2, color=dexamethasone)) +
  geom_point(size=4, alpha=0.8) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  theme_minimal() +
  ggtitle("PCA of Samples")

# Create Interactive PCA
p <- ggplot(pcaData, aes(x = PC1, y = PC2, color = dexamethasone,
                         text = paste("Sample:", name,
                                      "<br>Condition:", dexamethasone))) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  theme_minimal()

#interactive
# ggplotly(p, tooltip = "text")

Comment results here: We are looking here to see if the untreated samples are clustered together and the treated samples are also clustered – but we want to make sure that the treated and untreated are not clustered together - in this graph, we see that each treatment is clustered together which tells us that there is possibly something biological that should be investigated .

Section 8a: Heatmap of Top 10 Up and top 10 Down

# Add a column to mark direction

topUp_df$direction <- "Upregulated"
topDown_df$direction <- "Downregulated"

# Combine into one dataframe
topGenes_df <- rbind(topUp_df, topDown_df)

# Get top 20 genes by padj
topGenes_df <- head(rownames(topGenes_df), 20)

pheatmap(assay(vsdata)[topGenes_df, ],
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         annotation_col = colData,
         scale = "row",
         fontsize_row = 8,
         main = "Heatmap of Top 10 Up/Down DEGs")

Comment results here: This heatmap shows the top 10 samples for up regulation and the top 10 samples for down regulation .

Section 8b: Heatmap of Top 20 DEGs

# Get top 20 genes by padj
topGenes <- head(rownames(resOrdered), 20)

pheatmap(assay(vsdata)[topGenes, ],
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         show_rownames = TRUE,
         annotation_col = colData,
         scale = "row",
         fontsize_row = 8,
         main = "Heatmap of Top 20 DEGs (adj-pv)")

Comment results here: This heatmap shows the top 20 samples for either up regulation or down regulation. We see that for the untreated, the top four are up regulated while it is the opposite for the treated group. .

Section 9: Map ENSG IDs to Gene Names/UniPro, Gene Symbols

# Use biomaRt for ID conversion

mart <- useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")

# Convert IDs for top 10 up & top 10 down
geneIDs <- c(rownames(topUp_df), rownames(topDown_df))

annot <- getBM(attributes=c("ensembl_gene_id", 
                            "hgnc_symbol", 
                            "uniprot_gn_id", 
                            "refseq_mrna"),
               filters="ensembl_gene_id",
               values=geneIDs,
               mart=mart)

# Merge annotation with results
topUp_annot <- merge(topUp_df, annot, by.x="row.names", by.y="ensembl_gene_id")
topDown_annot <- merge(topDown_df, annot, by.x="row.names", by.y="ensembl_gene_id")

# Final annotated dataframes
#topUp_annot
#topDown_annot

# Merge annotation into res dataframe
resDF <- as.data.frame(res)
resDF$ensembl_gene_id <- rownames(resDF)

resDF_annot <- merge(resDF, annot, by="ensembl_gene_id", all.x=TRUE)

# Add a label column for top genes
highlightGenes <- c(rownames(topUp_df), rownames(topDown_df))
resDF_annot$label <- ifelse(resDF_annot$ensembl_gene_id %in% highlightGenes,
                            resDF_annot$hgnc_symbol, NA)

# ggplot volcano
vPlot <- ggplot(resDF_annot,
                      aes(x=log2FoldChange, y=-log10(pvalue),
                          text=paste0("ENSG: ", ensembl_gene_id,
                                      "<br>Gene: ", hgnc_symbol,
                                      "<br>UniProt: ", uniprot_gn_id,
                                      "<br>Log2FC: ", round(log2FoldChange,2),
                                      "<br>padj: ", signif(padj,3)))) +
  geom_jitter(aes(color = padj < 0.05), alpha=0.6) +
  scale_color_manual(values=c("grey","red")) +
  theme_minimal() +
  xlim(c(-6,6)) +
  labs(title="Interactive Volcano Plot",
       x="Log2 Fold Change", y="-log10(p-value)") +
  geom_text(aes(label=label), vjust=-1, size=3, na.rm=TRUE)

vPlot

# Convert to interactive with Plotly
# ggplotly(vPlot, tooltip="text")

Comment results here: This is an interactive volcano plot so you can identify the exact data for each dot by hovering over it. Again we want to focus on the red dots which are significant. Anything to the right is up regulated and anything to the left is down regulated - KCTD12 is down regulated and STOM is up regulated .

Section 10: Gene Ontology GO enrichment analysis

# Get significant DEGs (padj < 0.05)

sigGenes <- rownames(res)[which(res$padj < 0.05 & !is.na(res$padj))]

# Remove version numbers
# Convert Ensembl IDs to Entrez IDs
sigGenes <- gsub("\\..*", "", 
                 rownames(res)[which(res$padj < 0.05 & !is.na(res$padj))])

entrez <- bitr(sigGenes,
               fromType = "ENSEMBL",
               toType = "ENTREZID",
               OrgDb = org.Hs.eg.db)

head(entrez)
##           ENSEMBL ENTREZID
## 1 ENSG00000002834     3927
## 2 ENSG00000003096    90293
## 3 ENSG00000003402     8837
## 4 ENSG00000003987     9108
## 5 ENSG00000004059      381
## 6 ENSG00000004487    23028

Comment results here: This converts identification from ENSEMBL to ENTREZID .

Section 10a: GO enrichment, BP biological process

#Take the significant DE genes and see what 
#biological processes / pathways they’re enriched in.

ego <- enrichGO(gene=entrez$ENTREZID,
                OrgDb=org.Hs.eg.db,
                keyType="ENTREZID",
                ont="BP",   # Biological Process
                pAdjustMethod="BH",
                pvalueCutoff=0.05,
                readable=TRUE)

# Dotplot
dotplot(ego, showCategory=20) + ggtitle("GO Biological Process Enrichment")

Comment results here: This shows us the ratio of the genes to their biological processes. The processes at the top have a higher amount of the genes participating in that process. Biological process describes pathways and larger biological objectives that genes contribute to - we see that this gene has a high ratio for actin filament organization

Section 10b: GO enrichment,Molecular Function (MF)

ego_mf <- enrichGO(gene          = entrez$ENTREZID,
                   OrgDb         = org.Hs.eg.db,
                   keyType       = "ENTREZID",
                   ont           = "MF",
                   pAdjustMethod = "BH",
                   pvalueCutoff  = 0.05,
                   readable      = TRUE)

dotplot(ego_mf, showCategory=20) + ggtitle("GO: Molecular Function")

Comment results here: This shows the gene ratio compared to the molecular function - what is the function of the gene and how many of them are involved in the processes - molecular function defines the biochemical activities gene produces such as enzyme activity - we see this gene functions with DNA-binding transcription .

Section 10c: GO enrichment,Cellular Component (CC)

ego_cc <- enrichGO(gene          = entrez$ENTREZID,
                   OrgDb         = org.Hs.eg.db,
                   keyType       = "ENTREZID",
                   ont           = "CC",
                   pAdjustMethod = "BH",
                   pvalueCutoff  = 0.05,
                   readable      = TRUE)

dotplot(ego_cc, showCategory=20) + ggtitle("GO: Cellular Component")

Comment results here: This shows the gene ratio compared to the cellular component involved. It shows where in the cell the gene is active and how many of them are involved with that part of the cell - cellular component specifies where gene products are active within the cell - where find the products of this gene are active at cell-substrate juntions .

Section 11: KEGG Pathway Enrichment

#Check whether DE genes cluster in known metabolic or signaling pathways.

ekegg <- enrichKEGG(gene = entrez$ENTREZID,
                    organism = 'hsa',
                    pvalueCutoff = 0.05)

dotplot(ekegg, showCategory=15) + ggtitle("KEGG Pathway Enrichment")

Comment results here: This shows the links between genomic information and higher-order functional pathways - this helps to identify which pathways are enriched or disrupted - we see an enrichment PI3K-Akt signaling pathway is enriched

Section 12: Gene Set Enrichment Analysis (GSEA)

#Instead of just significant genes, 
#rank all genes by log2FC and see if gene sets are enriched.

# Rank genes by log2FC
geneList <- res$log2FoldChange
names(geneList) <- gsub("\\..*", "", rownames(res))
geneList <- sort(geneList, decreasing=TRUE)

# Run GSEA
gseaRes <- gseGO(geneList=geneList,
                 OrgDb=org.Hs.eg.db,
                 ont="BP",
                 keyType="ENSEMBL",
                 pvalueCutoff=0.05)

ridgeplot(gseaRes) + ggtitle("GSEA of Biological Processes")

Comment results here: GSEA tests enrichment of a predefined gene set across conditions - this show biological pathways or functions that are consistently up regulated or down regulated - we see that temperature stimulus is consistently down regulated

Section 13: Reactome Pathway Enrichment Analysis

# Perform Reactome pathway enrichment
ereactome <- enrichPathway(gene          = entrez$ENTREZID,
                           organism      = "human",
                           pvalueCutoff  = 0.05,
                           readable      = TRUE)

# View top enriched Reactome pathways
#head(as.data.frame(ereactome))

# Dotplot of top enriched Reactome pathways
dotplot(ereactome, showCategory = 20) + 
  ggtitle("Reactome Pathway Enrichment")

# Barplot alternative
barplot(ereactome, showCategory = 20, title = "Top Reactome Pathways")

# Network visualization (if desired)
cnetplot(ereactome, showCategory = 10, foldChange = NULL) +
  ggtitle("Reactome Pathway - Gene Network")

# Save results to CSV for downstream analysis
write.csv(as.data.frame(ereactome), "Reactome_enrichment_results.csv", row.names = FALSE)

Comment results here: We see the top reactome pathway is the RHO GTPase pathway

Section 14: upset Plot (alternative to Venn),

#If you have multiple contrasts, you can show overlap of significant DEGs.

# Example if you had multiple sets of DEGs: lik top_up, top_down, 
#show that there s no intersection between them
listInput <- list(
  Up = rownames(topUp_df),
  Down = rownames(topDown_df)
)

upset(fromList(listInput), order.by = "freq")

Comment results here: .

Section 15 Optional: Take the top 10 up and top 10 down to g:profller (G).

#First extract the Gene symols and the save as a csv

# Connect to Ensembl
mart <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

# Function to map Ensembl IDs to gene symbols
ensembl_to_symbol <- function(ensembl_ids) {
  mapping <- getBM(
    attributes = c("ensembl_gene_id", "hgnc_symbol"),
    filters = "ensembl_gene_id",
    values = ensembl_ids,
    mart = mart
  )
  return(mapping)
}

#Add add column name "Ensembl_ID" to the first 
#column of topUp_df and first column of topDown_df


# Move rownames into a real column called Ensembl_ID
topUp_df$Ensembl_ID <- rownames(topUp_df)
rownames(topUp_df) <- NULL

topDown_df$Ensembl_ID <- rownames(topDown_df)
rownames(topDown_df) <- NULL

#Now both will have Ensembl_ID as the first column (you can reorder if you want):
topUp_df <- topUp_df[, c("Ensembl_ID", setdiff(names(topUp_df), "Ensembl_ID"))]
topDown_df <- topDown_df[, c("Ensembl_ID", setdiff(names(topDown_df), "Ensembl_ID"))]


# Extract Ensembl IDs from your dataframes
up_ids <- topUp_df$Ensembl_ID
down_ids <- topDown_df$Ensembl_ID

# Map IDs
up_map <- ensembl_to_symbol(up_ids)
down_map <- ensembl_to_symbol(down_ids)

# Add regulation status
up_map$Status <- "Upregulated"
down_map$Status <- "Downregulated"

# Keep only 2 columns (GeneSymbol, Status)
up_map <- up_map[, c("hgnc_symbol", "Status")]
down_map <- down_map[, c("hgnc_symbol", "Status")]

# Combine
combined_df <- rbind(up_map, down_map)

# Rename columns
colnames(combined_df) <- c("GeneSymbol", "Status")

# Save to CSV
write.csv(combined_df, "combined_genes.csv", row.names = FALSE)

Comment results here: .

Section 16 Optional: Take the top 50 adjusted for pvalue, to g:profiler (G).

#First extract the Gene symbols and the save as a csv

# Order genes by adjusted p-value
top_50 <- res[order(res$padj), ]
top_50 <- top_50[1:50, ]


##
top_50$Ensembl_ID <- rownames(top_50)
rownames(top_50) <- NULL

# top50 will have Ensembl_ID as the first column (you can reorder if you want):
top_50 <- top_50[, c("Ensembl_ID", setdiff(names(top_50), "Ensembl_ID"))]



# Map Ensembl IDs to gene symbols
top_50$Gene_Symbol <- mapIds(
  org.Hs.eg.db,
  keys = top_50$Ensembl_ID,
  column = "SYMBOL",
  keytype = "ENSEMBL",
  multiVals = "first"
)

top_50$Status <- ifelse(top_50$log2FoldChange > 0, "Upregulated", "Downregulated")

# Keep only Ensembl_ID, Gene_Symbol, and adjusted p-value
out <- top_50[, c("Ensembl_ID", "Gene_Symbol", "log2FoldChange", "padj", "Status")]

# Write to CSV
write.csv(out, "top_50_genes.csv", row.names = FALSE)