We will use DESeq2 package in R to run Differential Gene Expression. The analysys will use smooth muscle cells to check the transcriptome of normal cells and cells treated with asthma medication (GEO: GSE52778). This R version is modified from the following Bioconductor vignette: https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#standard-workflow
Load Libraries
#install.packages("htmltools")
library(htmltools)
#BiocManager::install("rnaseqGene")
library(rnaseqGene)
#install.packages("plotly")
library(plotly)
#BiocManager::install("clusterProfiler", force = TRUE)
library(clusterProfiler)
#BiocManager::install("org.Hs.eg.db", force = TRUE)
library(org.Hs.eg.db)
#BiocManager::install("DOSE", force = TRUE)
library(DOSE)
#BiocManager::install("biomaRt", force = TRUE)
library(biomaRt)
#BiocManager::install("DESeq2", force = TRUE)
library(DESeq2)
#install.packages("pheatmap")
library(pheatmap)
#BiocManager::install("ReactomePA", force = TRUE)
library(ReactomePA)
#BiocManager::install("enrichplot", force = TRUE)
library(enrichplot)
#BiocManager::install("airway", force = TRUE)
#install.packages("airway")
library(airway)
library(ggplot2)
library(UpSetR)
library(tidyverse)
Load the data
#load the data
# Retrieve and prepare counts data file (number of read counts for each expressed gene)
# Retrieve and prepare metadata (experimental design) file
#Both files from airway package (part of bioconductor)
data(airway)
#check the package: contains info on the experiment
airway
## class: RangedSummarizedExperiment
## dim: 63677 8
## metadata(1): ''
## assays(1): counts
## rownames(63677): ENSG00000000003 ENSG00000000005 ... ENSG00000273492
## ENSG00000273493
## rowData names(10): gene_id gene_name ... seq_coord_system symbol
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
sample_info <- as.data.frame(colData(airway))
sample_info <- sample_info[,c(2,3)]
sample_info$dex <- gsub('trt', 'treated', sample_info$dex)
sample_info$dex <- gsub('untrt', 'untreated', sample_info$dex)
names(sample_info) <- c('cellLine', 'dexamethasone')
write.table(sample_info, file = "sample_info.csv", sep = ',', col.names = T,
row.names = T, quote = F)
countsData <- assay(airway)
write.table(countsData, file = "counts_data.csv", sep = ',', col.names = T,
row.names = T, quote = F)
Comment results here: two files are are counts_data.csv and the meta data us sample_info.csv .
# load the matrix that contains counts data
# check counts data
counts_data <- read.csv('counts_data.csv')
head(counts_data) # shows the first 6 rows
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003 679 448 873 408 1138
## ENSG00000000005 0 0 0 0 0
## ENSG00000000419 467 515 621 365 587
## ENSG00000000457 260 211 263 164 245
## ENSG00000000460 60 55 40 35 78
## ENSG00000000938 0 0 2 0 1
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003 1047 770 572
## ENSG00000000005 0 0 0
## ENSG00000000419 799 417 508
## ENSG00000000457 331 233 229
## ENSG00000000460 63 76 60
## ENSG00000000938 0 0 0
dim(counts_data) # shows the dimensions of the matrix
## [1] 63677 8
# load and read in the Meta Data (experimental design) = sample info
# check the file
colData <- read.csv('sample_info.csv')
colData # shows the first 6 rows
## cellLine dexamethasone
## SRR1039508 N61311 untreated
## SRR1039509 N61311 treated
## SRR1039512 N052611 untreated
## SRR1039513 N052611 treated
## SRR1039516 N080611 untreated
## SRR1039517 N080611 treated
## SRR1039520 N061011 untreated
## SRR1039521 N061011 treated
dim(colData) # shows the dimensions of the table
## [1] 8 2
# Make sure the row names in colData matches to column names in counts_data
all(colnames(counts_data) %in% rownames(colData))
## [1] TRUE
# Make sure everything is in the same order
all(colnames(counts_data) == rownames(colData))
## [1] TRUE
# Histogram to check the distribution of the count number in counts_data
# Convert all numeric columns into one vector to histogram
all_counts <- unlist(counts_data[ , sapply(counts_data, is.numeric)])
# Plot the histogram
hist(all_counts,
breaks = 1000, # adjust bins if needed
col = "steelblue",
main = "Histogram of All Counts",
xlab = "Counts",
xlim = c(0,4000), # adjust for outliers
ylab = "Frequency")
Comment results here: The histogram shows good results and it shows most of the dana sequences are in between 500 counts. .
dds <- DESeqDataSetFromMatrix(countData = counts_data,
colData = colData,
design = ~ dexamethasone)
#check dds, dim = dimension, same as counts data.
dds
## class: DESeqDataSet
## dim: 63677 8
## metadata(1): version
## assays(1): counts
## rownames(63677): ENSG00000000003 ENSG00000000005 ... ENSG00000273492
## ENSG00000273493
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(2): cellLine dexamethasone
# Optional: Pre-filtering: removing rows with low gene counts
# keeping only rows (genes) that show at least 10 reads/counts total
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
# Check dds, should have a lower dim now
dds
## class: DESeqDataSet
## dim: 22369 8
## metadata(1): version
## assays(1): counts
## rownames(22369): ENSG00000000003 ENSG00000000419 ... ENSG00000273487
## ENSG00000273488
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(2): cellLine dexamethasone
# Set the factor level for comparison
dds$dexamethasone <- relevel(dds$dexamethasone, ref = "untreated")
# Variance-stabilizing transformation (like log-scale normalization)
# This lets you check if one sample has very different expression distribution
# meaning → potential outlier.
vsd <- vst(dds, blind = FALSE)
# Boxplot of normalized counts per sample
boxplot(assay(vsd),
col = "lightblue",
las = 2,
main = "Boxplot of Normalized Counts (VST)",
ylab = "Normalized expression")
Comment results here:The DESeqDataSet was created successfully after removing genes with very low counts. The boxplot of normalized counts showed consistent distributions across all samples after the variance-stabilizing transformation, suggesting that normalization worked well and there are no strong outliers between samples. .
dds <- DESeq(dds)
# Extract the results table
res <- results(dds)
# Check the results table log2-fold change, pvalue, adj-pvalue, for each gene
head(res)
## log2 fold change (MLE): dexamethasone treated vs untreated
## Wald test p-value: dexamethasone treated vs untreated
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003 708.5979 -0.3788229 0.173155 -2.187769 0.0286865
## ENSG00000000419 520.2963 0.2037893 0.100742 2.022878 0.0430857
## ENSG00000000457 237.1621 0.0340631 0.126476 0.269325 0.7876795
## ENSG00000000460 57.9324 -0.1171564 0.301583 -0.388472 0.6976669
## ENSG00000000971 5817.3108 0.4409793 0.258776 1.704099 0.0883626
## ENSG00000001036 1282.1007 -0.2419097 0.119713 -2.020751 0.0433055
## padj
## <numeric>
## ENSG00000000003 0.138470
## ENSG00000000419 0.182998
## ENSG00000000457 0.929805
## ENSG00000000460 0.894231
## ENSG00000000971 0.297042
## ENSG00000001036 0.183498
# Explore the results table
summary(res)
##
## out of 22369 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 1884, 8.4%
## LFC < 0 (down) : 1502, 6.7%
## outliers [1] : 51, 0.23%
## low counts [2] : 3903, 17%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
#Explore the results table with adj-pvalue < 0.05
res0.05 <- results(dds, alpha = 0.05)
summary(res0.05)
##
## out of 22369 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1555, 7%
## LFC < 0 (down) : 1158, 5.2%
## outliers [1] : 51, 0.23%
## low counts [2] : 4336, 19%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
class(res)
## [1] "DESeqResults"
## attr(,"package")
## [1] "DESeq2"
head(res0.05)
## log2 fold change (MLE): dexamethasone treated vs untreated
## Wald test p-value: dexamethasone treated vs untreated
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003 708.5979 -0.3788229 0.173155 -2.187769 0.0286865
## ENSG00000000419 520.2963 0.2037893 0.100742 2.022878 0.0430857
## ENSG00000000457 237.1621 0.0340631 0.126476 0.269325 0.7876795
## ENSG00000000460 57.9324 -0.1171564 0.301583 -0.388472 0.6976669
## ENSG00000000971 5817.3108 0.4409793 0.258776 1.704099 0.0883626
## ENSG00000001036 1282.1007 -0.2419097 0.119713 -2.020751 0.0433055
## padj
## <numeric>
## ENSG00000000003 0.135712
## ENSG00000000419 0.179481
## ENSG00000000457 0.927740
## ENSG00000000460 0.890936
## ENSG00000000971 0.292084
## ENSG00000001036 0.179969
Comment results here:The DESeq analysis completed successfully, producing log2 fold changes and adjusted p-values for each gene. The summary indicated a significant number of genes were differentially expressed at an adjusted p-value cutoff of 0.05. This confirms the treatment with dexamethasone had a measurable impact on gene expression. .
# Create MA plot to check expression mean for each gene, versus log2-fold change
DESeq2::plotMA(res, ylim=c(-5,5))
# Create interactive MA plot
# Convert DESeq2 results to a data frame
resDF <- as.data.frame(res)
# Add gene names/IDs if available (rownames are gene IDs)
resDF$gene <- rownames(resDF)
# Create ggplot MA plot
pMA <- ggplot(resDF, aes(x=baseMean, y=log2FoldChange,
text = paste("ENSG: ", gene,
"<br>baseMean:", round(baseMean, 2),
"<br>log2FC:", round(log2FoldChange, 2),
"<br>padj:", signif(padj, 3)))) +
geom_point(aes(color = padj < 0.05), alpha=0.6) +
scale_x_log10() +
scale_color_manual(values = c("grey", "red")) +
labs(title="MA Plot", x="Mean Expression (baseMean)", y="Log2 Fold Change") +
theme_minimal()
# Convert ggplot to interactive plotly plot
# ggplotly(pMA, tooltip="text")
Comment results here: The MA plot showed most genes centered around zero log2 fold change, while several genes with significant differential expression appeared as red points above or below the midline. This pattern is expected, showing that while most genes remain stable, a subset responds strongly to dexamethasone treatment. .
# Gene expression: create Data frames for Top 10 Up and Top 10 Down Genes
# Order genes by adjusted p-value
resOrdered <- res[order(res$padj), ]
head(resOrdered, 20)
## log2 fold change (MLE): dexamethasone treated vs untreated
## Wald test p-value: dexamethasone treated vs untreated
## DataFrame with 20 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000152583 997.445 4.60255 0.2117419 21.7366 9.24586e-105
## ENSG00000148175 11193.723 1.45149 0.0848958 17.0974 1.55286e-65
## ENSG00000179094 776.600 3.18388 0.2015094 15.8002 3.10285e-56
## ENSG00000134686 2737.982 1.38717 0.0916587 15.1341 9.65533e-52
## ENSG00000125148 3656.267 2.20347 0.1474191 14.9470 1.63015e-50
## ... ... ... ... ... ...
## ENSG00000166741 7432.96 2.16222 0.1775146 12.1805 3.94760e-34
## ENSG00000183044 498.69 1.15807 0.0971415 11.9215 9.14677e-33
## ENSG00000144369 1303.26 -1.35029 0.1161490 -11.6255 3.05683e-31
## ENSG00000198624 2057.20 2.89495 0.2506388 11.5503 7.35848e-31
## ENSG00000164125 5596.35 1.12456 0.0981153 11.4616 2.05624e-30
## padj
## <numeric>
## ENSG00000152583 1.70262e-100
## ENSG00000148175 1.42980e-61
## ENSG00000179094 1.90463e-52
## ENSG00000134686 4.44507e-48
## ENSG00000125148 6.00384e-47
## ... ...
## ENSG00000166741 4.54344e-31
## ENSG00000183044 9.90811e-30
## ENSG00000144369 3.12731e-28
## ENSG00000198624 7.13192e-28
## ENSG00000164125 1.89328e-27
# Select Up-regulated genes with log2FC > 0
upGenes <- subset(resOrdered, log2FoldChange > 0)
#top 10 up
topUp <- head(upGenes, 10)
# Select Down-regulated genes with log2FC < 0
downGenes <- subset(resOrdered, log2FoldChange < 0)
#top 10 down
topDown <- head(downGenes, 10)
# Save as dataframes
topUp_df <- as.data.frame(topUp)
topDown_df <- as.data.frame(topDown)
# View
topUp_df
## baseMean log2FoldChange lfcSE stat pvalue
## ENSG00000152583 997.4447 4.602555 0.21174194 21.73662 9.245855e-105
## ENSG00000148175 11193.7232 1.451494 0.08489578 17.09736 1.552860e-65
## ENSG00000179094 776.6002 3.183883 0.20150941 15.80017 3.102848e-56
## ENSG00000134686 2737.9816 1.387168 0.09165870 15.13406 9.655331e-52
## ENSG00000125148 3656.2674 2.203468 0.14741913 14.94696 1.630150e-50
## ENSG00000120129 3409.0384 2.949013 0.20154644 14.63193 1.757388e-48
## ENSG00000189221 2341.7807 3.305227 0.22803072 14.49466 1.309529e-47
## ENSG00000109906 385.0713 7.176149 0.49635930 14.45757 2.245873e-47
## ENSG00000101347 12703.4128 3.857750 0.27932477 13.81099 2.188188e-43
## ENSG00000162614 5393.1145 2.029959 0.15170652 13.38083 7.826993e-41
## padj
## ENSG00000152583 1.702624e-100
## ENSG00000148175 1.429795e-61
## ENSG00000179094 1.904631e-52
## ENSG00000134686 4.445073e-48
## ENSG00000125148 6.003843e-47
## ENSG00000120129 5.393715e-45
## ENSG00000189221 3.444997e-44
## ENSG00000109906 5.169719e-44
## ENSG00000101347 4.029548e-40
## ENSG00000162614 1.310310e-37
topDown_df
## baseMean log2FoldChange lfcSE stat pvalue
## ENSG00000178695 2649.8225 -2.499883 0.17890348 -13.97336 2.266696e-44
## ENSG00000196517 229.6380 -2.216939 0.16682841 -13.28874 2.690743e-40
## ENSG00000116584 2250.5809 -1.014492 0.08242501 -12.30807 8.196465e-35
## ENSG00000144369 1303.2632 -1.350294 0.11614901 -11.62553 3.056829e-31
## ENSG00000108821 100989.9160 -1.299471 0.11531121 -11.26925 1.861461e-29
## ENSG00000162692 508.1648 -3.609332 0.32642874 -11.05703 2.027005e-28
## ENSG00000181467 630.8696 -1.272452 0.11841431 -10.74576 6.204626e-27
## ENSG00000145777 152.3708 -2.786561 0.26171426 -10.64734 1.794184e-26
## ENSG00000163491 160.2233 -2.472566 0.23688626 -10.43778 1.666663e-25
## ENSG00000183160 2987.6473 -1.169122 0.11218874 -10.42103 1.987967e-25
## padj
## ENSG00000178695 4.637912e-41
## ENSG00000196517 4.129169e-37
## ENSG00000116584 1.006253e-31
## ENSG00000144369 3.127306e-28
## ENSG00000108821 1.632324e-26
## ENSG00000162692 1.555304e-25
## ENSG00000181467 3.939938e-24
## ENSG00000145777 1.032497e-23
## ENSG00000163491 8.769029e-23
## ENSG00000183160 1.016901e-22
Comment results here: The top 10 upregulated and top 10 downregulated genes were successfully identified based on adjusted p-values. The upregulated genes had positive log2 fold changes, indicating higher expression in treated samples, while the downregulated genes had negative fold changes, reflecting reduced expression after treatment. .
# Extract normalized expression values from VST object
vsd_mat <- assay(vsd)
# Function to build tidy dataframe for a set of genes
make_gene_df <- function(gene_list, label) {
df <- as.data.frame(t(vsd_mat[rownames(vsd_mat) %in% gene_list, ]))
df$sample <- rownames(df)
df <- pivot_longer(df, cols = -sample, names_to = "gene", values_to = "expression")
df <- merge(df, colData, by.x = "sample", by.y = "row.names")
df$direction <- label
# shorten gene labels to last 5 characters
df$gene_short <- substr(df$gene, nchar(df$gene)-4, nchar(df$gene))
# custom facet label: gene short ID + newline + direction
df$facet_label <- paste0(df$gene_short, "\n", label)
return(df)
}
# Collect top 10 upregulated and downregulated gene IDs
topUp_genes <- rownames(topUp_df)
topDown_genes <- rownames(topDown_df)
# Create tidy dataframes with custom labels
df_up <- make_gene_df(topUp_genes, "Upregulated")
df_down <- make_gene_df(topDown_genes, "Downregulated")
# Bind and enforce order: Upregulated first, then Downregulated
df_all <- rbind(df_up, df_down)
df_all$facet_label <- factor(df_all$facet_label,
levels = c(unique(df_up$facet_label), unique(df_down$facet_label)))
# Plot: 4 per row
ggplot(df_all, aes(x = dexamethasone, y = expression, fill = dexamethasone)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
geom_jitter(width = 0.2, alpha = 0.5) +
facet_wrap(~ facet_label, scales = "free_y", ncol = 4) +
scale_fill_manual(values = c("untreated" = "skyblue", "treated" = "salmon")) +
labs(title = "Expression of Top 10 Up- and Down-Regulated Genes",
x = "Condition", y = "Normalized Expression (VST)") +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 30, hjust = 1),
strip.text = element_text(face = "bold"))
Comment results here: The top 10 upregulated and top 10 downregulated genes were successfully identified based on adjusted p-values. The upregulated genes had positive log2 fold changes, indicating higher expression in treated samples, while the downregulated genes had negative fold changes, reflecting reduced expression after treatment. .
resDF <- as.data.frame(res) #already done for MA plot
resDF$gene <- rownames(res) #already done for MA plot
volcanop <- ggplot(resDF, aes(x=log2FoldChange, y=-log10(pvalue),
text = paste("ENSG: ", gene,
"<br>log2FC:", round(log2FoldChange, 2),
"<br>p-adj:", signif(padj, 3)))) +
geom_point(aes(color = padj < 0.05), alpha=0.6) +
scale_color_manual(values = c("grey", "red")) +
theme_minimal() +
xlim(c(-6,6)) +
labs(title="Volcano Plot", x="Log2 Fold Change", y="-log10(p-value)")
volcanop
# Convert to interactive plot
#ggplotly(volcanop, tooltip = "text")
Comment rsults here: The volcano plot revealed a large number of significantly differentially expressed genes, with clear separation between upregulated and downregulated groups. Genes with both high log2 fold change and low p-values stood out in red, highlighting the most biologically relevant changes due to dexamethasone. .
vsdata <- vst(dds, blind=FALSE)
pcaData <- plotPCA(vsdata, intgroup="dexamethasone", returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=dexamethasone)) +
geom_point(size=4, alpha=0.8) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
theme_minimal() +
ggtitle("PCA of Samples")
# Create Interactive PCA
p <- ggplot(pcaData, aes(x = PC1, y = PC2, color = dexamethasone,
text = paste("Sample:", name,
"<br>Condition:", dexamethasone))) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
theme_minimal()
#interactive
# ggplotly(p, tooltip = "text")
Comment results here: The PCA plot displayed a clear separation between treated and untreated samples along the first principal component. This indicates that dexamethasone treatment is the main source of variation in the dataset, confirming strong transcriptional effects and good experimental reproducibility. .
# Add a column to mark direction
topUp_df$direction <- "Upregulated"
topDown_df$direction <- "Downregulated"
# Combine into one dataframe
topGenes_df <- rbind(topUp_df, topDown_df)
# Get top 20 genes by padj
topGenes_df <- head(rownames(topGenes_df), 20)
pheatmap(assay(vsdata)[topGenes_df, ],
cluster_rows = TRUE,
cluster_cols = TRUE,
show_rownames = TRUE,
annotation_col = colData,
scale = "row",
fontsize_row = 8,
main = "Heatmap of Top 10 Up/Down DEGs")
Comment results here: Both heatmaps showed distinct clustering of treated versus untreated samples. The top differentially expressed genes had clear expression trends, with upregulated genes showing higher values in treated samples. This supports the findings from the DESeq2 analysis and visualizes consistent biological separation. .
# Get top 20 genes by padj
topGenes <- head(rownames(resOrdered), 20)
pheatmap(assay(vsdata)[topGenes, ],
cluster_rows = TRUE,
cluster_cols = TRUE,
show_rownames = TRUE,
annotation_col = colData,
scale = "row",
fontsize_row = 8,
main = "Heatmap of Top 20 DEGs (adj-pv)")
Comment results here: The Ensembl gene IDs were successfully mapped to gene symbols and UniProt IDs using the biomaRt package. This step adds biological meaning to the results, allowing easier interpretation of which specific genes are affected by treatment. .
# Use biomaRt for ID conversion
mart <- useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")
# Convert IDs for top 10 up & top 10 down
geneIDs <- c(rownames(topUp_df), rownames(topDown_df))
annot <- getBM(attributes=c("ensembl_gene_id",
"hgnc_symbol",
"uniprot_gn_id",
"refseq_mrna"),
filters="ensembl_gene_id",
values=geneIDs,
mart=mart)
# Merge annotation with results
topUp_annot <- merge(topUp_df, annot, by.x="row.names", by.y="ensembl_gene_id")
topDown_annot <- merge(topDown_df, annot, by.x="row.names", by.y="ensembl_gene_id")
# Final annotated dataframes
#topUp_annot
#topDown_annot
# Merge annotation into res dataframe
resDF <- as.data.frame(res)
resDF$ensembl_gene_id <- rownames(resDF)
resDF_annot <- merge(resDF, annot, by="ensembl_gene_id", all.x=TRUE)
# Add a label column for top genes
highlightGenes <- c(rownames(topUp_df), rownames(topDown_df))
resDF_annot$label <- ifelse(resDF_annot$ensembl_gene_id %in% highlightGenes,
resDF_annot$hgnc_symbol, NA)
# ggplot volcano
vPlot <- ggplot(resDF_annot,
aes(x=log2FoldChange, y=-log10(pvalue),
text=paste0("ENSG: ", ensembl_gene_id,
"<br>Gene: ", hgnc_symbol,
"<br>UniProt: ", uniprot_gn_id,
"<br>Log2FC: ", round(log2FoldChange,2),
"<br>padj: ", signif(padj,3)))) +
geom_jitter(aes(color = padj < 0.05), alpha=0.6) +
scale_color_manual(values=c("grey","red")) +
theme_minimal() +
xlim(c(-6,6)) +
labs(title="Interactive Volcano Plot",
x="Log2 Fold Change", y="-log10(p-value)") +
geom_text(aes(label=label), vjust=-1, size=3, na.rm=TRUE)
vPlot
# Convert to interactive with Plotly
# ggplotly(vPlot, tooltip="text")
Comment results here: The GO enrichment results identified several biological processes, molecular functions, and cellular components significantly enriched among differentially expressed genes. Many were related to stress response, inflammation, and transcription regulation, which align with the known effects of dexamethasone on airway cells. .
# Get significant DEGs (padj < 0.05)
sigGenes <- rownames(res)[which(res$padj < 0.05 & !is.na(res$padj))]
# Remove version numbers
# Convert Ensembl IDs to Entrez IDs
sigGenes <- gsub("\\..*", "",
rownames(res)[which(res$padj < 0.05 & !is.na(res$padj))])
entrez <- bitr(sigGenes,
fromType = "ENSEMBL",
toType = "ENTREZID",
OrgDb = org.Hs.eg.db)
head(entrez)
## ENSEMBL ENTREZID
## 1 ENSG00000002834 3927
## 2 ENSG00000003096 90293
## 3 ENSG00000003402 8837
## 4 ENSG00000003987 9108
## 5 ENSG00000004059 381
## 6 ENSG00000004487 23028
Comment results here: The KEGG pathway analysis revealed enrichment in pathways associated with glucocorticoid response, cytokine signaling, and immune modulation. These results provide strong biological support for the treatment effects observed in the differential expression analysis. .
#Take the significant DE genes and see what
#biological processes / pathways they’re enriched in.
ego <- enrichGO(gene=entrez$ENTREZID,
OrgDb=org.Hs.eg.db,
keyType="ENTREZID",
ont="BP", # Biological Process
pAdjustMethod="BH",
pvalueCutoff=0.05,
readable=TRUE)
# Dotplot
dotplot(ego, showCategory=20) + ggtitle("GO Biological Process Enrichment")
Comment results here: The GSEA analysis showed enrichment of gene sets related to inflammatory and stress-response processes, confirming that dexamethasone regulates multiple coordinated pathways rather than isolated genes. The ridgeplot highlighted several upregulated biological pathways consistent with the drug’s mechanism of action. .
ego_mf <- enrichGO(gene = entrez$ENTREZID,
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
dotplot(ego_mf, showCategory=20) + ggtitle("GO: Molecular Function")
Comment results here: The Reactome enrichment highlighted pathways involving immune system regulation, cytokine signaling, and transcriptional control. The dotplot and barplot visualization clearly show that these pathways are significantly enriched in treated samples, emphasizing the systemic transcriptional effects of dexamethasone. .
ego_cc <- enrichGO(gene = entrez$ENTREZID,
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
dotplot(ego_cc, showCategory=20) + ggtitle("GO: Cellular Component")
Comment results here: The UpSet plot confirmed that there is no overlap between the top upregulated and downregulated gene sets, which is expected since these represent opposite regulatory responses. It helps confirm that the filtering and classification of genes worked properly. .
#Check whether DE genes cluster in known metabolic or signaling pathways.
ekegg <- enrichKEGG(gene = entrez$ENTREZID,
organism = 'hsa',
pvalueCutoff = 0.05)
dotplot(ekegg, showCategory=15) + ggtitle("KEGG Pathway Enrichment")
Comment results here: The CSV file containing the top 10 upregulated and downregulated genes was generated successfully for further use in g:Profiler. These genes can now be analyzed to explore specific biological pathways and networks affected by dexamethasone treatment. .
#Instead of just significant genes,
#rank all genes by log2FC and see if gene sets are enriched.
# Rank genes by log2FC
geneList <- res$log2FoldChange
names(geneList) <- gsub("\\..*", "", rownames(res))
geneList <- sort(geneList, decreasing=TRUE)
# Run GSEA
gseaRes <- gseGO(geneList=geneList,
OrgDb=org.Hs.eg.db,
ont="BP",
keyType="ENSEMBL",
pvalueCutoff=0.05)
ridgeplot(gseaRes) + ggtitle("GSEA of Biological Processes")
Comment results here: The top 50 genes by adjusted p-value were exported successfully. These genes represent the most statistically significant transcriptional responses, providing a focused set for external pathway and enrichment analysis using g:Profiler. .
# Perform Reactome pathway enrichment
ereactome <- enrichPathway(gene = entrez$ENTREZID,
organism = "human",
pvalueCutoff = 0.05,
readable = TRUE)
# View top enriched Reactome pathways
#head(as.data.frame(ereactome))
# Dotplot of top enriched Reactome pathways
dotplot(ereactome, showCategory = 20) +
ggtitle("Reactome Pathway Enrichment")
# Barplot alternative
barplot(ereactome, showCategory = 20, title = "Top Reactome Pathways")
# Network visualization (if desired)
cnetplot(ereactome, showCategory = 10, foldChange = NULL) +
ggtitle("Reactome Pathway - Gene Network")
# Save results to CSV for downstream analysis
write.csv(as.data.frame(ereactome), "Reactome_enrichment_results.csv", row.names = FALSE)
Comment results here: The Reactome pathway enrichment analysis revealed that several key biological pathways were significantly enriched among the differentially expressed genes. Many of these pathways were related to immune system regulation, cytokine signaling, and metabolic processes. The dotplot and barplot clearly showed which pathways were most affected, while the network visualization highlighted the strong relationships between genes and pathways. Overall, these findings confirm that dexamethasone influences multiple cellular signaling and immune response mechanisms. .
#If you have multiple contrasts, you can show overlap of significant DEGs.
# Example if you had multiple sets of DEGs: lik top_up, top_down,
#show that there s no intersection etween them
listInput <- list(
Up = rownames(topUp_df),
Down = rownames(topDown_df)
)
upset(fromList(listInput), order.by = "freq")
Comment results here: The UpSet plot demonstrated that there was no overlap between the upregulated and downregulated gene groups, which is expected because these represent opposite directions of expression change. This confirms that the classification of genes into upregulated and downregulated sets was accurate. It also shows that the differential expression filtering worked properly, with each group containing distinct sets of genes responding to dexamethasone treatment. .
#First extract the Gene symols and the save as a csv
# Connect to Ensembl
mart <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
# Function to map Ensembl IDs to gene symbols
ensembl_to_symbol <- function(ensembl_ids) {
mapping <- getBM(
attributes = c("ensembl_gene_id", "hgnc_symbol"),
filters = "ensembl_gene_id",
values = ensembl_ids,
mart = mart
)
return(mapping)
}
#Add add column name "Ensembl_ID" to the first
#column of topUp_df and first column of topDown_df
# Move rownames into a real column called Ensembl_ID
topUp_df$Ensembl_ID <- rownames(topUp_df)
rownames(topUp_df) <- NULL
topDown_df$Ensembl_ID <- rownames(topDown_df)
rownames(topDown_df) <- NULL
#Now both will have Ensembl_ID as the first column (you can reorder if you want):
topUp_df <- topUp_df[, c("Ensembl_ID", setdiff(names(topUp_df), "Ensembl_ID"))]
topDown_df <- topDown_df[, c("Ensembl_ID", setdiff(names(topDown_df), "Ensembl_ID"))]
# Extract Ensembl IDs from your dataframes
up_ids <- topUp_df$Ensembl_ID
down_ids <- topDown_df$Ensembl_ID
# Map IDs
up_map <- ensembl_to_symbol(up_ids)
down_map <- ensembl_to_symbol(down_ids)
# Add regulation status
up_map$Status <- "Upregulated"
down_map$Status <- "Downregulated"
# Keep only 2 columns (GeneSymbol, Status)
up_map <- up_map[, c("hgnc_symbol", "Status")]
down_map <- down_map[, c("hgnc_symbol", "Status")]
# Combine
combined_df <- rbind(up_map, down_map)
# Rename columns
colnames(combined_df) <- c("GeneSymbol", "Status")
# Save to CSV
write.csv(combined_df, "combined_genes.csv", row.names = FALSE)
Comment results here: The top 10 upregulated and downregulated genes were successfully mapped to their corresponding gene symbols and saved as a combined CSV file for external analysis in g:Profiler. This step helps connect the statistical results to biological meaning, allowing exploration of which pathways or biological functions these top genes participate in. The resulting dataset provides a clear and concise summary of the most significant transcriptional changes caused by dexamethasone. .
#First extract the Gene symbols and the save as a csv
# Order genes by adjusted p-value
top_50 <- res[order(res$padj), ]
top_50 <- top_50[1:50, ]
##
top_50$Ensembl_ID <- rownames(top_50)
rownames(top_50) <- NULL
# top50 will have Ensembl_ID as the first column (you can reorder if you want):
top_50 <- top_50[, c("Ensembl_ID", setdiff(names(top_50), "Ensembl_ID"))]
# Map Ensembl IDs to gene symbols
top_50$Gene_Symbol <- mapIds(
org.Hs.eg.db,
keys = top_50$Ensembl_ID,
column = "SYMBOL",
keytype = "ENSEMBL",
multiVals = "first"
)
top_50$Status <- ifelse(top_50$log2FoldChange > 0, "Upregulated", "Downregulated")
# Keep only Ensembl_ID, Gene_Symbol, and adjusted p-value
out <- top_50[, c("Ensembl_ID", "Gene_Symbol", "log2FoldChange", "padj", "Status")]
# Write to CSV
write.csv(out, "top_50_genes.csv", row.names = FALSE)
Comment here: The top 50 genes with the lowest adjusted p-values were identified, annotated with gene symbols, and saved for further enrichment analysis. This focused list represents the most statistically significant and biologically relevant transcriptional changes. It serves as a valuable input for g:Profiler or similar databases to identify enriched pathways and gene networks affected by the treatment. The results provide a strong foundation for deeper functional interpretation of the dataset.
if (!requireNamespace(“BiocManager”, quietly = TRUE)) install.packages(“BiocManager”)
BiocManager::install(“airway”, force = TRUE)