We will use DESeq2 package in R to run Differential Gene Expression. The analysys will use smooth muscle cells to check the transcriptome of normal cells and cells treated with asthma medication (GEO: GSE52778). This R version is modified from the following Bioconductor vignette: https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#standard-workflow
Load Libraries
#install.packages("htmltools")
library(htmltools)
#BiocManager::install("rnaseqGene")
library(rnaseqGene)
#install.packages("plotly")
library(plotly)
#BiocManager::install("clusterProfiler", force = TRUE)
library(clusterProfiler)
#BiocManager::install("org.Hs.eg.db", force = TRUE)
library(org.Hs.eg.db)
#BiocManager::install("DOSE", force = TRUE)
library(DOSE)
#BiocManager::install("biomaRt", force = TRUE)
library(biomaRt)
#BiocManager::install("DESeq2", force = TRUE)
library(DESeq2)
#install.packages("pheatmap")
library(pheatmap)
BiocManager::install("ReactomePA", force = TRUE)
##
## The downloaded binary packages are in
## /var/folders/gn/q1_95fp94sd24mvn322sd1_c0000gn/T//RtmpaKKw7K/downloaded_packages
library(ReactomePA)
#BiocManager::install("enrichplot", force = TRUE)
library(enrichplot)
#BiocManager::install("airway", force = TRUE)
#install.packages("airway")
library(airway)
library(ggplot2)
library(UpSetR)
library(tidyverse)
Load the data
#load the data
# Retrieve and prepare counts data file (number of read counts for each expressed gene)
# Retrieve and prepare metadata (experimental design) file
#Both files from airway package (part of bioconductor)
data(airway)
#check the package: contains info on the experiment
airway
## class: RangedSummarizedExperiment
## dim: 63677 8
## metadata(1): ''
## assays(1): counts
## rownames(63677): ENSG00000000003 ENSG00000000005 ... ENSG00000273492
## ENSG00000273493
## rowData names(10): gene_id gene_name ... seq_coord_system symbol
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
sample_info <- as.data.frame(colData(airway))
sample_info <- sample_info[,c(2,3)]
sample_info$dex <- gsub('trt', 'treated', sample_info$dex)
sample_info$dex <- gsub('untrt', 'untreated', sample_info$dex)
names(sample_info) <- c('cellLine', 'dexamethasone')
write.table(sample_info, file = "sample_info.csv", sep = ',', col.names = T,
row.names = T, quote = F)
countsData <- assay(airway)
write.table(countsData, file = "counts_data.csv", sep = ',', col.names = T,
row.names = T, quote = F)
Comment results here: .
# load the matrix that contains counts data
# check counts data
counts_data <- read.csv('counts_data.csv')
head(counts_data) # shows the first 6 rows
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003 679 448 873 408 1138
## ENSG00000000005 0 0 0 0 0
## ENSG00000000419 467 515 621 365 587
## ENSG00000000457 260 211 263 164 245
## ENSG00000000460 60 55 40 35 78
## ENSG00000000938 0 0 2 0 1
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003 1047 770 572
## ENSG00000000005 0 0 0
## ENSG00000000419 799 417 508
## ENSG00000000457 331 233 229
## ENSG00000000460 63 76 60
## ENSG00000000938 0 0 0
dim(counts_data) # shows the dimensions of the matrix
## [1] 63677 8
# load and read in the Meta Data (experimental design) = sample info
# check the file
colData <- read.csv('sample_info.csv')
colData # shows the first 6 rows
## cellLine dexamethasone
## SRR1039508 N61311 untreated
## SRR1039509 N61311 treated
## SRR1039512 N052611 untreated
## SRR1039513 N052611 treated
## SRR1039516 N080611 untreated
## SRR1039517 N080611 treated
## SRR1039520 N061011 untreated
## SRR1039521 N061011 treated
dim(colData) # shows the dimensions of the table
## [1] 8 2
# Make sure the row names in colData matches to column names in counts_data
all(colnames(counts_data) %in% rownames(colData))
## [1] TRUE
# Make sure everything is in the same order
all(colnames(counts_data) == rownames(colData))
## [1] TRUE
# Histogram to check the distribution of the count number in counts_data
# Convert all numeric columns into one vector to histogram
all_counts <- unlist(counts_data[ , sapply(counts_data, is.numeric)])
# Plot the histogram
hist(all_counts,
breaks = 1000, # adjust bins if needed
col = "steelblue",
main = "Histogram of All Counts",
xlab = "Counts",
xlim = c(0,4000), # adjust for outliers
ylab = "Frequency")
Comment results here:The histogram shows that most genes have low read counts, as expected for RNA-seq data, with a small number of highly expressed genes. This confirms the raw data follows a typical RNA-seq count distribution. .
dds <- DESeqDataSetFromMatrix(countData = counts_data,
colData = colData,
design = ~ dexamethasone)
#check dds, dim = dimension, same as counts data.
dds
## class: DESeqDataSet
## dim: 63677 8
## metadata(1): version
## assays(1): counts
## rownames(63677): ENSG00000000003 ENSG00000000005 ... ENSG00000273492
## ENSG00000273493
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(2): cellLine dexamethasone
# Optional: Pre-filtering: removing rows with low gene counts
# keeping only rows (genes) that show at least 10 reads/counts total
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
# Check dds, should have a lower dim now
dds
## class: DESeqDataSet
## dim: 22369 8
## metadata(1): version
## assays(1): counts
## rownames(22369): ENSG00000000003 ENSG00000000419 ... ENSG00000273487
## ENSG00000273488
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(2): cellLine dexamethasone
# Set the factor level for comparison
dds$dexamethasone <- relevel(dds$dexamethasone, ref = "untreated")
# Variance-stabilizing transformation (like log-scale normalization)
# This lets you check if one sample has very different expression distribution
# meaning → potential outlier.
vsd <- vst(dds, blind = FALSE)
# Boxplot of normalized counts per sample
boxplot(assay(vsd),
col = "lightblue",
las = 2,
main = "Boxplot of Normalized Counts (VST)",
ylab = "Normalized expression")
Comment results here: After variance-stabilizing transformation (VST), the boxplot displays uniform median expression across all samples, indicating successful normalization and absence of major outliers. .
dds <- DESeq(dds)
# Extract the results table
res <- results(dds)
# Check the results table log2-fold change, pvalue, adj-pvalue, for each gene
head(res)
## log2 fold change (MLE): dexamethasone treated vs untreated
## Wald test p-value: dexamethasone treated vs untreated
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003 708.5979 -0.3788229 0.173155 -2.187769 0.0286865
## ENSG00000000419 520.2963 0.2037893 0.100742 2.022878 0.0430857
## ENSG00000000457 237.1621 0.0340631 0.126476 0.269325 0.7876795
## ENSG00000000460 57.9324 -0.1171564 0.301583 -0.388472 0.6976669
## ENSG00000000971 5817.3108 0.4409793 0.258776 1.704099 0.0883626
## ENSG00000001036 1282.1007 -0.2419097 0.119713 -2.020751 0.0433055
## padj
## <numeric>
## ENSG00000000003 0.138470
## ENSG00000000419 0.182998
## ENSG00000000457 0.929805
## ENSG00000000460 0.894231
## ENSG00000000971 0.297042
## ENSG00000001036 0.183498
# Explore the results table
summary(res)
##
## out of 22369 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 1884, 8.4%
## LFC < 0 (down) : 1502, 6.7%
## outliers [1] : 51, 0.23%
## low counts [2] : 3903, 17%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
#Explore the results table with adj-pvalue < 0.05
res0.05 <- results(dds, alpha = 0.05)
summary(res0.05)
##
## out of 22369 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1555, 7%
## LFC < 0 (down) : 1158, 5.2%
## outliers [1] : 51, 0.23%
## low counts [2] : 4336, 19%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
class(res)
## [1] "DESeqResults"
## attr(,"package")
## [1] "DESeq2"
head(res0.05)
## log2 fold change (MLE): dexamethasone treated vs untreated
## Wald test p-value: dexamethasone treated vs untreated
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003 708.5979 -0.3788229 0.173155 -2.187769 0.0286865
## ENSG00000000419 520.2963 0.2037893 0.100742 2.022878 0.0430857
## ENSG00000000457 237.1621 0.0340631 0.126476 0.269325 0.7876795
## ENSG00000000460 57.9324 -0.1171564 0.301583 -0.388472 0.6976669
## ENSG00000000971 5817.3108 0.4409793 0.258776 1.704099 0.0883626
## ENSG00000001036 1282.1007 -0.2419097 0.119713 -2.020751 0.0433055
## padj
## <numeric>
## ENSG00000000003 0.135712
## ENSG00000000419 0.179481
## ENSG00000000457 0.927740
## ENSG00000000460 0.890936
## ENSG00000000971 0.292084
## ENSG00000001036 0.179969
Comment results here: Differential expression analysis revealed 1,884 upregulated and 1,502 downregulated genes (adjusted p < 0.1), suggesting the dexamethasone treatment induces significant transcriptomic changes in smooth muscle cells. .
# Create MA plot to check expression mean for each gene, versus log2-fold change
DESeq2::plotMA(res, ylim=c(-5,5))
# Create interactive MA plot
# Convert DESeq2 results to a data frame
resDF <- as.data.frame(res)
# Add gene names/IDs if available (rownames are gene IDs)
resDF$gene <- rownames(resDF)
# Create ggplot MA plot
pMA <- ggplot(resDF, aes(x=baseMean, y=log2FoldChange,
text = paste("ENSG: ", gene,
"<br>baseMean:", round(baseMean, 2),
"<br>log2FC:", round(log2FoldChange, 2),
"<br>padj:", signif(padj, 3)))) +
geom_point(aes(color = padj < 0.05), alpha=0.6) +
scale_x_log10() +
scale_color_manual(values = c("grey", "red")) +
labs(title="MA Plot", x="Mean Expression (baseMean)", y="Log2 Fold Change") +
theme_minimal()
# Convert ggplot to interactive plotly plot
# ggplotly(pMA, tooltip="text")
Comment results here:The MA plot shows most genes cluster around log2FC = 0, with red points (significant DEGs) scattered above and below, indicating both up- and down-regulated genes due to treatment. .
# Gene expression: create Data frames for Top 10 Up and Top 10 Down Genes
# Order genes by adjusted p-value
resOrdered <- res[order(res$padj), ]
head(resOrdered, 20)
## log2 fold change (MLE): dexamethasone treated vs untreated
## Wald test p-value: dexamethasone treated vs untreated
## DataFrame with 20 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000152583 997.445 4.60255 0.2117419 21.7366 9.24586e-105
## ENSG00000148175 11193.723 1.45149 0.0848958 17.0974 1.55286e-65
## ENSG00000179094 776.600 3.18388 0.2015094 15.8002 3.10285e-56
## ENSG00000134686 2737.982 1.38717 0.0916587 15.1341 9.65533e-52
## ENSG00000125148 3656.267 2.20347 0.1474191 14.9470 1.63015e-50
## ... ... ... ... ... ...
## ENSG00000166741 7432.96 2.16222 0.1775146 12.1805 3.94760e-34
## ENSG00000183044 498.69 1.15807 0.0971415 11.9215 9.14677e-33
## ENSG00000144369 1303.26 -1.35029 0.1161490 -11.6255 3.05683e-31
## ENSG00000198624 2057.20 2.89495 0.2506388 11.5503 7.35848e-31
## ENSG00000164125 5596.35 1.12456 0.0981153 11.4616 2.05624e-30
## padj
## <numeric>
## ENSG00000152583 1.70262e-100
## ENSG00000148175 1.42980e-61
## ENSG00000179094 1.90463e-52
## ENSG00000134686 4.44507e-48
## ENSG00000125148 6.00384e-47
## ... ...
## ENSG00000166741 4.54344e-31
## ENSG00000183044 9.90811e-30
## ENSG00000144369 3.12731e-28
## ENSG00000198624 7.13192e-28
## ENSG00000164125 1.89328e-27
# Select Up-regulated genes with log2FC > 0
upGenes <- subset(resOrdered, log2FoldChange > 0)
#top 10 up
topUp <- head(upGenes, 10)
# Select Down-regulated genes with log2FC < 0
downGenes <- subset(resOrdered, log2FoldChange < 0)
#top 10 down
topDown <- head(downGenes, 10)
# Save as dataframes
topUp_df <- as.data.frame(topUp)
topDown_df <- as.data.frame(topDown)
# View
topUp_df
## baseMean log2FoldChange lfcSE stat pvalue
## ENSG00000152583 997.4447 4.602555 0.21174194 21.73662 9.245855e-105
## ENSG00000148175 11193.7232 1.451494 0.08489578 17.09736 1.552859e-65
## ENSG00000179094 776.6002 3.183883 0.20150941 15.80017 3.102848e-56
## ENSG00000134686 2737.9816 1.387168 0.09165870 15.13406 9.655331e-52
## ENSG00000125148 3656.2674 2.203468 0.14741913 14.94696 1.630150e-50
## ENSG00000120129 3409.0384 2.949013 0.20154644 14.63193 1.757388e-48
## ENSG00000189221 2341.7807 3.305227 0.22803072 14.49466 1.309529e-47
## ENSG00000109906 385.0713 7.176149 0.49635930 14.45757 2.245873e-47
## ENSG00000101347 12703.4128 3.857750 0.27932477 13.81099 2.188188e-43
## ENSG00000162614 5393.1145 2.029959 0.15170652 13.38083 7.826993e-41
## padj
## ENSG00000152583 1.702624e-100
## ENSG00000148175 1.429795e-61
## ENSG00000179094 1.904631e-52
## ENSG00000134686 4.445073e-48
## ENSG00000125148 6.003843e-47
## ENSG00000120129 5.393715e-45
## ENSG00000189221 3.444997e-44
## ENSG00000109906 5.169719e-44
## ENSG00000101347 4.029548e-40
## ENSG00000162614 1.310310e-37
topDown_df
## baseMean log2FoldChange lfcSE stat pvalue
## ENSG00000178695 2649.8225 -2.499883 0.17890348 -13.97336 2.266696e-44
## ENSG00000196517 229.6380 -2.216939 0.16682841 -13.28874 2.690743e-40
## ENSG00000116584 2250.5809 -1.014492 0.08242501 -12.30807 8.196465e-35
## ENSG00000144369 1303.2632 -1.350294 0.11614901 -11.62553 3.056829e-31
## ENSG00000108821 100989.9160 -1.299471 0.11531121 -11.26925 1.861461e-29
## ENSG00000162692 508.1648 -3.609332 0.32642874 -11.05703 2.027005e-28
## ENSG00000181467 630.8696 -1.272452 0.11841431 -10.74576 6.204626e-27
## ENSG00000145777 152.3708 -2.786561 0.26171426 -10.64734 1.794184e-26
## ENSG00000163491 160.2233 -2.472566 0.23688626 -10.43778 1.666663e-25
## ENSG00000183160 2987.6473 -1.169122 0.11218874 -10.42103 1.987967e-25
## padj
## ENSG00000178695 4.637912e-41
## ENSG00000196517 4.129169e-37
## ENSG00000116584 1.006253e-31
## ENSG00000144369 3.127306e-28
## ENSG00000108821 1.632324e-26
## ENSG00000162692 1.555304e-25
## ENSG00000181467 3.939938e-24
## ENSG00000145777 1.032497e-23
## ENSG00000163491 8.769029e-23
## ENSG00000183160 1.016901e-22
Comment results here:The top upregulated genes (e.g., ENSG00000152583, ENSG00000148175) have high positive log2FC, while top downregulated genes (e.g., ENSG00000178695, ENSG00000196517) show strong negative log2FC, confirming clear transcriptional shifts. .
# Extract normalized expression values from VST object
vsd_mat <- assay(vsd)
# Function to build tidy dataframe for a set of genes
make_gene_df <- function(gene_list, label) {
df <- as.data.frame(t(vsd_mat[rownames(vsd_mat) %in% gene_list, ]))
df$sample <- rownames(df)
df <- pivot_longer(df, cols = -sample, names_to = "gene", values_to = "expression")
df <- merge(df, colData, by.x = "sample", by.y = "row.names")
df$direction <- label
# shorten gene labels to last 5 characters
df$gene_short <- substr(df$gene, nchar(df$gene)-4, nchar(df$gene))
# custom facet label: gene short ID + newline + direction
df$facet_label <- paste0(df$gene_short, "\n", label)
return(df)
}
# Collect top 10 upregulated and downregulated gene IDs
topUp_genes <- rownames(topUp_df)
topDown_genes <- rownames(topDown_df)
# Create tidy dataframes with custom labels
df_up <- make_gene_df(topUp_genes, "Upregulated")
df_down <- make_gene_df(topDown_genes, "Downregulated")
# Bind and enforce order: Upregulated first, then Downregulated
df_all <- rbind(df_up, df_down)
df_all$facet_label <- factor(df_all$facet_label,
levels = c(unique(df_up$facet_label), unique(df_down$facet_label)))
# Plot: 4 per row
ggplot(df_all, aes(x = dexamethasone, y = expression, fill = dexamethasone)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
geom_jitter(width = 0.2, alpha = 0.5) +
facet_wrap(~ facet_label, scales = "free_y", ncol = 4) +
scale_fill_manual(values = c("untreated" = "skyblue", "treated" = "salmon")) +
labs(title = "Expression of Top 10 Up- and Down-Regulated Genes",
x = "Condition", y = "Normalized Expression (VST)") +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 30, hjust = 1),
strip.text = element_text(face = "bold"))
Comment results here:Boxplots illustrate consistent higher expression of upregulated genes and lower expression of downregulated ones in treated samples, confirming DESeq2 results visually. .
resDF <- as.data.frame(res) #already done for MA plot
resDF$gene <- rownames(res) #already done for MA plot
volcanop <- ggplot(resDF, aes(x=log2FoldChange, y=-log10(pvalue),
text = paste("ENSG: ", gene,
"<br>log2FC:", round(log2FoldChange, 2),
"<br>p-adj:", signif(padj, 3)))) +
geom_point(aes(color = padj < 0.05), alpha=0.6) +
scale_color_manual(values = c("grey", "red")) +
theme_minimal() +
xlim(c(-6,6)) +
labs(title="Volcano Plot", x="Log2 Fold Change", y="-log10(p-value)")
volcanop
# Convert to interactive plot
#ggplotly(volcanop, tooltip = "text")
Comment rsults here:The volcano plot highlights significant DEGs (red points) with both large fold changes and small p-values, providing a clear overview of gene regulation magnitude and significance.
vsdata <- vst(dds, blind=FALSE)
pcaData <- plotPCA(vsdata, intgroup="dexamethasone", returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=dexamethasone)) +
geom_point(size=4, alpha=0.8) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
theme_minimal() +
ggtitle("PCA of Samples")
# Create Interactive PCA
p <- ggplot(pcaData, aes(x = PC1, y = PC2, color = dexamethasone,
text = paste("Sample:", name,
"<br>Condition:", dexamethasone))) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
theme_minimal()
#interactive
# ggplotly(p, tooltip = "text")
Comment results here:PCA shows clear separation between treated and untreated samples along PC1, suggesting strong treatment effects on overall gene expression profiles and good reproducibility within each group.
# Add a column to mark direction
topUp_df$direction <- "Upregulated"
topDown_df$direction <- "Downregulated"
# Combine into one dataframe
topGenes_df <- rbind(topUp_df, topDown_df)
# Get top 20 genes by padj
topGenes_df <- head(rownames(topGenes_df), 20)
pheatmap(assay(vsdata)[topGenes_df, ],
cluster_rows = TRUE,
cluster_cols = TRUE,
show_rownames = TRUE,
annotation_col = colData,
scale = "row",
fontsize_row = 8,
main = "Heatmap of Top 10 Up/Down DEGs")
Comment results here:The clustered heatmap reveals distinct expression patterns between treated and untreated samples, with consistent grouping by condition, validating the DESeq2 analysis. .
# Get top 20 genes by padj
topGenes <- head(rownames(resOrdered), 20)
pheatmap(assay(vsdata)[topGenes, ],
cluster_rows = TRUE,
cluster_cols = TRUE,
show_rownames = TRUE,
annotation_col = colData,
scale = "row",
fontsize_row = 8,
main = "Heatmap of Top 20 DEGs (adj-pv)")
Comment results here:The heatmap of top 20 genes further confirms clear treatment-based clustering, reinforcing that dexamethasone significantly alters gene expression in smooth muscle cells. .
# Use biomaRt for ID conversion
mart <- useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")
# Convert IDs for top 10 up & top 10 down
geneIDs <- c(rownames(topUp_df), rownames(topDown_df))
annot <- getBM(attributes=c("ensembl_gene_id",
"hgnc_symbol",
"uniprot_gn_id",
"refseq_mrna"),
filters="ensembl_gene_id",
values=geneIDs,
mart=mart)
# Merge annotation with results
topUp_annot <- merge(topUp_df, annot, by.x="row.names", by.y="ensembl_gene_id")
topDown_annot <- merge(topDown_df, annot, by.x="row.names", by.y="ensembl_gene_id")
# Final annotated dataframes
#topUp_annot
#topDown_annot
# Merge annotation into res dataframe
resDF <- as.data.frame(res)
resDF$ensembl_gene_id <- rownames(resDF)
resDF_annot <- merge(resDF, annot, by="ensembl_gene_id", all.x=TRUE)
# Add a label column for top genes
highlightGenes <- c(rownames(topUp_df), rownames(topDown_df))
resDF_annot$label <- ifelse(resDF_annot$ensembl_gene_id %in% highlightGenes,
resDF_annot$hgnc_symbol, NA)
# ggplot volcano
vPlot <- ggplot(resDF_annot,
aes(x=log2FoldChange, y=-log10(pvalue),
text=paste0("ENSG: ", ensembl_gene_id,
"<br>Gene: ", hgnc_symbol,
"<br>UniProt: ", uniprot_gn_id,
"<br>Log2FC: ", round(log2FoldChange,2),
"<br>padj: ", signif(padj,3)))) +
geom_jitter(aes(color = padj < 0.05), alpha=0.6) +
scale_color_manual(values=c("grey","red")) +
theme_minimal() +
xlim(c(-6,6)) +
labs(title="Interactive Volcano Plot",
x="Log2 Fold Change", y="-log10(p-value)") +
geom_text(aes(label=label), vjust=-1, size=3, na.rm=TRUE)
vPlot
# Convert to interactive with Plotly
# ggplotly(vPlot, tooltip="text")
Comment results here: Mapping Ensembl IDs to gene symbols successfully annotated the top DEGs, facilitating biological interpretation and downstream enrichment analyses. .
# Get significant DEGs (padj < 0.05)
sigGenes <- rownames(res)[which(res$padj < 0.05 & !is.na(res$padj))]
# Remove version numbers
# Convert Ensembl IDs to Entrez IDs
sigGenes <- gsub("\\..*", "",
rownames(res)[which(res$padj < 0.05 & !is.na(res$padj))])
entrez <- bitr(sigGenes,
fromType = "ENSEMBL",
toType = "ENTREZID",
OrgDb = org.Hs.eg.db)
head(entrez)
## ENSEMBL ENTREZID
## 1 ENSG00000002834 3927
## 2 ENSG00000003096 90293
## 3 ENSG00000003402 8837
## 4 ENSG00000003987 9108
## 5 ENSG00000004059 381
## 6 ENSG00000004487 23028
Comment results here:GO enrichment revealed biological processes related to inflammatory response, stress regulation, and glucocorticoid response, aligning with known effects of dexamethasone. .
#Take the significant DE genes and see what
#biological processes / pathways they’re enriched in.
ego <- enrichGO(gene=entrez$ENTREZID,
OrgDb=org.Hs.eg.db,
keyType="ENTREZID",
ont="BP", # Biological Process
pAdjustMethod="BH",
pvalueCutoff=0.05,
readable=TRUE)
# Dotplot
dotplot(ego, showCategory=20) + ggtitle("GO Biological Process Enrichment")
Comment results here:GO enrichment revealed biological processes related to inflammatory response, stress regulation, and glucocorticoid response, aligning with known effects of dexamethasone. .
ego_mf <- enrichGO(gene = entrez$ENTREZID,
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
dotplot(ego_mf, showCategory=20) + ggtitle("GO: Molecular Function")
Comment results here: Enriched molecular functions include receptor binding, cytokine activity, and transcription regulation, consistent with dexamethasone’s impact on immune and signaling pathways. .
ego_cc <- enrichGO(gene = entrez$ENTREZID,
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
dotplot(ego_cc, showCategory=20) + ggtitle("GO: Cellular Component")
Comment results here:Cellular component analysis indicates enrichment in cytosol, nucleus, and membrane-associated proteins, suggesting that dexamethasone influences multiple cellular compartments. .
#Check whether DE genes cluster in known metabolic or signaling pathways.
ekegg <- enrichKEGG(gene = entrez$ENTREZID,
organism = 'hsa',
pvalueCutoff = 0.05)
dotplot(ekegg, showCategory=15) + ggtitle("KEGG Pathway Enrichment")
Comment results here:KEGG pathways enriched include cytokine–cytokine receptor interaction, TNF signaling, and glucocorticoid response pathways, supporting the observed anti-inflammatory effects. .
#Instead of just significant genes,
#rank all genes by log2FC and see if gene sets are enriched.
# Rank genes by log2FC
geneList <- res$log2FoldChange
names(geneList) <- gsub("\\..*", "", rownames(res))
geneList <- sort(geneList, decreasing=TRUE)
# Run GSEA
gseaRes <- gseGO(geneList=geneList,
OrgDb=org.Hs.eg.db,
ont="BP",
keyType="ENSEMBL",
pvalueCutoff=0.05)
ridgeplot(gseaRes) + ggtitle("GSEA of Biological Processes")
Comment results here:GSEA results show strong enrichment of pathways linked to immune modulation and cellular stress responses, highlighting systematic transcriptional reprogramming under treatment. .
# Perform Reactome pathway enrichment
ereactome <- enrichPathway(gene = entrez$ENTREZID,
organism = "human",
pvalueCutoff = 0.05,
readable = TRUE)
# View top enriched Reactome pathways
#head(as.data.frame(ereactome))
# Dotplot of top enriched Reactome pathways
dotplot(ereactome, showCategory = 20) +
ggtitle("Reactome Pathway Enrichment")
# Barplot alternative
barplot(ereactome, showCategory = 20, title = "Top Reactome Pathways")
# Network visualization (if desired)
cnetplot(ereactome, showCategory = 10, foldChange = NULL) +
ggtitle("Reactome Pathway - Gene Network")
# Save results to CSV for downstream analysis
write.csv(as.data.frame(ereactome), "Reactome_enrichment_results.csv", row.names = FALSE)
Comment results here:Reactome enrichment identified key pathways in signal transduction, immune system regulation, and metabolism. The network plots visualize how these pathways interact functionally. .
#If you have multiple contrasts, you can show overlap of significant DEGs.
# Example if you had multiple sets of DEGs: lik top_up, top_down,
#show that there s no intersection etween them
listInput <- list(
Up = rownames(topUp_df),
Down = rownames(topDown_df)
)
upset(fromList(listInput), order.by = "freq")
Comment results here: The UpSet plot confirms there is no overlap between top upregulated and downregulated gene sets, validating proper classification and distinct expression patterns. .
#First extract the Gene symols and the save as a csv
# Connect to Ensembl
mart <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
# Function to map Ensembl IDs to gene symbols
ensembl_to_symbol <- function(ensembl_ids) {
mapping <- getBM(
attributes = c("ensembl_gene_id", "hgnc_symbol"),
filters = "ensembl_gene_id",
values = ensembl_ids,
mart = mart
)
return(mapping)
}
#Add add column name "Ensembl_ID" to the first
#column of topUp_df and first column of topDown_df
# Move rownames into a real column called Ensembl_ID
topUp_df$Ensembl_ID <- rownames(topUp_df)
rownames(topUp_df) <- NULL
topDown_df$Ensembl_ID <- rownames(topDown_df)
rownames(topDown_df) <- NULL
#Now both will have Ensembl_ID as the first column (you can reorder if you want):
topUp_df <- topUp_df[, c("Ensembl_ID", setdiff(names(topUp_df), "Ensembl_ID"))]
topDown_df <- topDown_df[, c("Ensembl_ID", setdiff(names(topDown_df), "Ensembl_ID"))]
# Extract Ensembl IDs from your dataframes
up_ids <- topUp_df$Ensembl_ID
down_ids <- topDown_df$Ensembl_ID
# Map IDs
up_map <- ensembl_to_symbol(up_ids)
down_map <- ensembl_to_symbol(down_ids)
# Add regulation status
up_map$Status <- "Upregulated"
down_map$Status <- "Downregulated"
# Keep only 2 columns (GeneSymbol, Status)
up_map <- up_map[, c("hgnc_symbol", "Status")]
down_map <- down_map[, c("hgnc_symbol", "Status")]
# Combine
combined_df <- rbind(up_map, down_map)
# Rename columns
colnames(combined_df) <- c("GeneSymbol", "Status")
# Save to CSV
write.csv(combined_df, "combined_genes.csv", row.names = FALSE)
Comment results here: .
#First extract the Gene symbols and the save as a csv
# Order genes by adjusted p-value
top_50 <- res[order(res$padj), ]
top_50 <- top_50[1:50, ]
##
top_50$Ensembl_ID <- rownames(top_50)
rownames(top_50) <- NULL
# top50 will have Ensembl_ID as the first column (you can reorder if you want):
top_50 <- top_50[, c("Ensembl_ID", setdiff(names(top_50), "Ensembl_ID"))]
# Map Ensembl IDs to gene symbols
top_50$Gene_Symbol <- mapIds(
org.Hs.eg.db,
keys = top_50$Ensembl_ID,
column = "SYMBOL",
keytype = "ENSEMBL",
multiVals = "first"
)
top_50$Status <- ifelse(top_50$log2FoldChange > 0, "Upregulated", "Downregulated")
# Keep only Ensembl_ID, Gene_Symbol, and adjusted p-value
out <- top_50[, c("Ensembl_ID", "Gene_Symbol", "log2FoldChange", "padj", "Status")]
# Write to CSV
write.csv(out, "top_50_genes.csv", row.names = FALSE)