We will use DESeq2 package in R to run Differential Gene Expression. The analysys will use smooth muscle cells to check the transcriptome of normal cells and cells treated with asthma medication (GEO: GSE52778). This R version is modified from the following Bioconductor vignette: https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#standard-workflow
Load Libraries
#install.packages("htmltools")
library(htmltools)
#BiocManager::install("rnaseqGene")
library(rnaseqGene)
#install.packages("plotly")
library(plotly)
#BiocManager::install("clusterProfiler", force = TRUE)
library(clusterProfiler)
#BiocManager::install("org.Hs.eg.db", force = TRUE)
library(org.Hs.eg.db)
#BiocManager::install("DOSE", force = TRUE)
library(DOSE)
#BiocManager::install("biomaRt", force = TRUE)
library(biomaRt)
#BiocManager::install("DESeq2", force = TRUE)
library(DESeq2)
#install.packages("pheatmap")
library(pheatmap)
#BiocManager::install("ReactomePA", force = TRUE)
library(ReactomePA)
#BiocManager::install("enrichplot", force = TRUE)
library(enrichplot)
#BiocManager::install("airway", force = TRUE)
#install.packages("airway")
library(airway)
library(ggplot2)
library(UpSetR)
library(tidyverse)
install.packages("ggridges")
Load the data
#load the data
# Retrieve and prepare counts data file (number of read counts for each expressed gene)
# Retrieve and prepare metadata (experimental design) file
#Both files from airway package (part of bioconductor)
data(airway)
#check the package: contains info on the experiment
airway
## class: RangedSummarizedExperiment
## dim: 63677 8
## metadata(1): ''
## assays(1): counts
## rownames(63677): ENSG00000000003 ENSG00000000005 ... ENSG00000273492
## ENSG00000273493
## rowData names(10): gene_id gene_name ... seq_coord_system symbol
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
sample_info <- as.data.frame(colData(airway))
sample_info <- sample_info[,c(2,3)]
sample_info$dex <- gsub('trt', 'treated', sample_info$dex)
sample_info$dex <- gsub('untrt', 'untreated', sample_info$dex)
names(sample_info) <- c('cellLine', 'dexamethasone')
write.table(sample_info, file = "sample_info.csv", sep = ',', col.names = T,
row.names = T, quote = F)
countsData <- assay(airway)
write.table(countsData, file = "counts_data.csv", sep = ',', col.names = T,
row.names = T, quote = F)
Comment results here: .
# load the matrix that contains counts data
# check counts data
counts_data <- read.csv('counts_data.csv')
head(counts_data) # shows the first 6 rows
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003 679 448 873 408 1138
## ENSG00000000005 0 0 0 0 0
## ENSG00000000419 467 515 621 365 587
## ENSG00000000457 260 211 263 164 245
## ENSG00000000460 60 55 40 35 78
## ENSG00000000938 0 0 2 0 1
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003 1047 770 572
## ENSG00000000005 0 0 0
## ENSG00000000419 799 417 508
## ENSG00000000457 331 233 229
## ENSG00000000460 63 76 60
## ENSG00000000938 0 0 0
dim(counts_data) # shows the dimensions of the matrix
## [1] 63677 8
# load and read in the Meta Data (experimental design) = sample info
# check the file
colData <- read.csv('sample_info.csv')
colData # shows the first 6 rows
## cellLine dexamethasone
## SRR1039508 N61311 untreated
## SRR1039509 N61311 treated
## SRR1039512 N052611 untreated
## SRR1039513 N052611 treated
## SRR1039516 N080611 untreated
## SRR1039517 N080611 treated
## SRR1039520 N061011 untreated
## SRR1039521 N061011 treated
dim(colData) # shows the dimensions of the table
## [1] 8 2
# Make sure the row names in colData matches to column names in counts_data
all(colnames(counts_data) %in% rownames(colData))
## [1] TRUE
# Make sure everything is in the same order
all(colnames(counts_data) == rownames(colData))
## [1] TRUE
# Histogram to check the distribution of the count number in counts_data
# Convert all numeric columns into one vector to histogram
all_counts <- unlist(counts_data[ , sapply(counts_data, is.numeric)])
# Plot the histogram
hist(all_counts,
breaks = 1000, # adjust bins if needed
col = "steelblue",
main = "Histogram of All Counts",
xlab = "Counts",
xlim = c(0,4000), # adjust for outliers
ylab = "Frequency")
Comment results here: Histogram - the most abundant count numbers is between 0 and 500 .
dds <- DESeqDataSetFromMatrix(countData = counts_data,
colData = colData,
design = ~ dexamethasone)
#check dds, dim = dimension, same as counts data.
dds
## class: DESeqDataSet
## dim: 63677 8
## metadata(1): version
## assays(1): counts
## rownames(63677): ENSG00000000003 ENSG00000000005 ... ENSG00000273492
## ENSG00000273493
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(2): cellLine dexamethasone
# Optional: Pre-filtering: removing rows with low gene counts
# keeping only rows (genes) that show at least 10 reads/counts total
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
# Check dds, should have a lower dim now
dds
## class: DESeqDataSet
## dim: 22369 8
## metadata(1): version
## assays(1): counts
## rownames(22369): ENSG00000000003 ENSG00000000419 ... ENSG00000273487
## ENSG00000273488
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(2): cellLine dexamethasone
# Set the factor level for comparison
dds$dexamethasone <- relevel(dds$dexamethasone, ref = "untreated")
# Variance-stabilizing transformation (like log-scale normalization)
# This lets you check if one sample has very different expression distribution
# meaning → potential outlier.
vsd <- vst(dds, blind = FALSE)
# Boxplot of normalized counts per sample
boxplot(assay(vsd),
col = "lightblue",
las = 2,
main = "Boxplot of Normalized Counts (VST)",
ylab = "Normalized expression")
Comment results here: The normalized expression is equal for each of the samples which is what we want to see - we want to see that we can now compare biological variation .
dds <- DESeq(dds)
# Extract the results table
res <- results(dds)
# Check the results table log2-fold change, pvalue, adj-pvalue, for each gene
head(res)
## log2 fold change (MLE): dexamethasone treated vs untreated
## Wald test p-value: dexamethasone treated vs untreated
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003 708.5979 -0.3788229 0.173155 -2.187769 0.0286865
## ENSG00000000419 520.2963 0.2037893 0.100742 2.022878 0.0430857
## ENSG00000000457 237.1621 0.0340631 0.126476 0.269325 0.7876795
## ENSG00000000460 57.9324 -0.1171564 0.301583 -0.388472 0.6976669
## ENSG00000000971 5817.3108 0.4409793 0.258776 1.704099 0.0883626
## ENSG00000001036 1282.1007 -0.2419097 0.119713 -2.020751 0.0433055
## padj
## <numeric>
## ENSG00000000003 0.138470
## ENSG00000000419 0.182998
## ENSG00000000457 0.929805
## ENSG00000000460 0.894231
## ENSG00000000971 0.297042
## ENSG00000001036 0.183498
# Explore the results table
summary(res)
##
## out of 22369 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 1884, 8.4%
## LFC < 0 (down) : 1502, 6.7%
## outliers [1] : 51, 0.23%
## low counts [2] : 3903, 17%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
#Explore the results table with adj-pvalue < 0.05
res0.05 <- results(dds, alpha = 0.05)
summary(res0.05)
##
## out of 22369 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1555, 7%
## LFC < 0 (down) : 1158, 5.2%
## outliers [1] : 51, 0.23%
## low counts [2] : 4336, 19%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
class(res)
## [1] "DESeqResults"
## attr(,"package")
## [1] "DESeq2"
head(res0.05)
## log2 fold change (MLE): dexamethasone treated vs untreated
## Wald test p-value: dexamethasone treated vs untreated
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003 708.5979 -0.3788229 0.173155 -2.187769 0.0286865
## ENSG00000000419 520.2963 0.2037893 0.100742 2.022878 0.0430857
## ENSG00000000457 237.1621 0.0340631 0.126476 0.269325 0.7876795
## ENSG00000000460 57.9324 -0.1171564 0.301583 -0.388472 0.6976669
## ENSG00000000971 5817.3108 0.4409793 0.258776 1.704099 0.0883626
## ENSG00000001036 1282.1007 -0.2419097 0.119713 -2.020751 0.0433055
## padj
## <numeric>
## ENSG00000000003 0.135712
## ENSG00000000419 0.179481
## ENSG00000000457 0.927740
## ENSG00000000460 0.890936
## ENSG00000000971 0.292084
## ENSG00000001036 0.179969
Comment results here: This shows the adjusted p values for the samples - we see that none of the samples are significant .
# Create MA plot to check expression mean for each gene, versus log2-fold change
DESeq2::plotMA(res, ylim=c(-5,5))
# Create interactive MA plot
# Convert DESeq2 results to a data frame
resDF <- as.data.frame(res)
# Add gene names/IDs if available (rownames are gene IDs)
resDF$gene <- rownames(resDF)
# Create ggplot MA plot
pMA <- ggplot(resDF, aes(x=baseMean, y=log2FoldChange,
text = paste("ENSG: ", gene,
"<br>baseMean:", round(baseMean, 2),
"<br>log2FC:", round(log2FoldChange, 2),
"<br>padj:", signif(padj, 3)))) +
geom_point(aes(color = padj < 0.05), alpha=0.6) +
scale_x_log10() +
scale_color_manual(values = c("grey", "red")) +
labs(title="MA Plot", x="Mean Expression (baseMean)", y="Log2 Fold Change") +
theme_minimal()
# Convert ggplot to interactive plotly plot
# ggplotly(pMA, tooltip="text")
Comment results here: For the MA plot, we want to focus on the normalized counts that are further to the right of the graph - away from the low value of the mean normalized counts - anything about the gray area (0) is up regulated and anything below is down regulated .
# Gene expression: create Data frames for Top 10 Up and Top 10 Down Genes
# Order genes by adjusted p-value
resOrdered <- res[order(res$padj), ]
head(resOrdered, 20)
## log2 fold change (MLE): dexamethasone treated vs untreated
## Wald test p-value: dexamethasone treated vs untreated
## DataFrame with 20 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000152583 997.445 4.60255 0.2117419 21.7366 9.24586e-105
## ENSG00000148175 11193.723 1.45149 0.0848958 17.0974 1.55286e-65
## ENSG00000179094 776.600 3.18388 0.2015094 15.8002 3.10285e-56
## ENSG00000134686 2737.982 1.38717 0.0916587 15.1341 9.65533e-52
## ENSG00000125148 3656.267 2.20347 0.1474191 14.9470 1.63015e-50
## ... ... ... ... ... ...
## ENSG00000166741 7432.96 2.16222 0.1775146 12.1805 3.94760e-34
## ENSG00000183044 498.69 1.15807 0.0971415 11.9215 9.14677e-33
## ENSG00000144369 1303.26 -1.35029 0.1161490 -11.6255 3.05683e-31
## ENSG00000198624 2057.20 2.89495 0.2506388 11.5503 7.35848e-31
## ENSG00000164125 5596.35 1.12456 0.0981153 11.4616 2.05624e-30
## padj
## <numeric>
## ENSG00000152583 1.70262e-100
## ENSG00000148175 1.42980e-61
## ENSG00000179094 1.90463e-52
## ENSG00000134686 4.44507e-48
## ENSG00000125148 6.00384e-47
## ... ...
## ENSG00000166741 4.54344e-31
## ENSG00000183044 9.90811e-30
## ENSG00000144369 3.12731e-28
## ENSG00000198624 7.13192e-28
## ENSG00000164125 1.89328e-27
# Select Up-regulated genes with log2FC > 0
upGenes <- subset(resOrdered, log2FoldChange > 0)
#top 10 up
topUp <- head(upGenes, 10)
# Select Down-regulated genes with log2FC < 0
downGenes <- subset(resOrdered, log2FoldChange < 0)
#top 10 down
topDown <- head(downGenes, 10)
# Save as dataframes
topUp_df <- as.data.frame(topUp)
topDown_df <- as.data.frame(topDown)
# View
topUp_df
## baseMean log2FoldChange lfcSE stat pvalue
## ENSG00000152583 997.4447 4.602555 0.21174194 21.73662 9.245855e-105
## ENSG00000148175 11193.7232 1.451494 0.08489578 17.09736 1.552860e-65
## ENSG00000179094 776.6002 3.183883 0.20150941 15.80017 3.102848e-56
## ENSG00000134686 2737.9816 1.387168 0.09165870 15.13406 9.655331e-52
## ENSG00000125148 3656.2674 2.203468 0.14741913 14.94696 1.630150e-50
## ENSG00000120129 3409.0384 2.949013 0.20154644 14.63193 1.757388e-48
## ENSG00000189221 2341.7807 3.305227 0.22803072 14.49466 1.309529e-47
## ENSG00000109906 385.0713 7.176149 0.49635930 14.45757 2.245873e-47
## ENSG00000101347 12703.4128 3.857750 0.27932477 13.81099 2.188188e-43
## ENSG00000162614 5393.1145 2.029959 0.15170652 13.38083 7.826993e-41
## padj
## ENSG00000152583 1.702624e-100
## ENSG00000148175 1.429795e-61
## ENSG00000179094 1.904631e-52
## ENSG00000134686 4.445073e-48
## ENSG00000125148 6.003843e-47
## ENSG00000120129 5.393715e-45
## ENSG00000189221 3.444997e-44
## ENSG00000109906 5.169719e-44
## ENSG00000101347 4.029548e-40
## ENSG00000162614 1.310310e-37
topDown_df
## baseMean log2FoldChange lfcSE stat pvalue
## ENSG00000178695 2649.8225 -2.499883 0.17890348 -13.97336 2.266696e-44
## ENSG00000196517 229.6380 -2.216939 0.16682841 -13.28874 2.690743e-40
## ENSG00000116584 2250.5809 -1.014492 0.08242501 -12.30807 8.196465e-35
## ENSG00000144369 1303.2632 -1.350294 0.11614901 -11.62553 3.056829e-31
## ENSG00000108821 100989.9160 -1.299471 0.11531121 -11.26925 1.861461e-29
## ENSG00000162692 508.1648 -3.609332 0.32642874 -11.05703 2.027005e-28
## ENSG00000181467 630.8696 -1.272452 0.11841431 -10.74576 6.204626e-27
## ENSG00000145777 152.3708 -2.786561 0.26171426 -10.64734 1.794184e-26
## ENSG00000163491 160.2233 -2.472566 0.23688626 -10.43778 1.666663e-25
## ENSG00000183160 2987.6473 -1.169122 0.11218874 -10.42103 1.987967e-25
## padj
## ENSG00000178695 4.637912e-41
## ENSG00000196517 4.129169e-37
## ENSG00000116584 1.006253e-31
## ENSG00000144369 3.127306e-28
## ENSG00000108821 1.632324e-26
## ENSG00000162692 1.555304e-25
## ENSG00000181467 3.939938e-24
## ENSG00000145777 1.032497e-23
## ENSG00000163491 8.769029e-23
## ENSG00000183160 1.016901e-22
Comment results here: This shows us the adjusted p value and we see that all 10 samples are significant. .
# Extract normalized expression values from VST object
vsd_mat <- assay(vsd)
# Function to build tidy dataframe for a set of genes
make_gene_df <- function(gene_list, label) {
df <- as.data.frame(t(vsd_mat[rownames(vsd_mat) %in% gene_list, ]))
df$sample <- rownames(df)
df <- pivot_longer(df, cols = -sample, names_to = "gene", values_to = "expression")
df <- merge(df, colData, by.x = "sample", by.y = "row.names")
df$direction <- label
# shorten gene labels to last 5 characters
df$gene_short <- substr(df$gene, nchar(df$gene)-4, nchar(df$gene))
# custom facet label: gene short ID + newline + direction
df$facet_label <- paste0(df$gene_short, "\n", label)
return(df)
}
# Collect top 10 upregulated and downregulated gene IDs
topUp_genes <- rownames(topUp_df)
topDown_genes <- rownames(topDown_df)
# Create tidy dataframes with custom labels
df_up <- make_gene_df(topUp_genes, "Upregulated")
df_down <- make_gene_df(topDown_genes, "Downregulated")
# Bind and enforce order: Upregulated first, then Downregulated
df_all <- rbind(df_up, df_down)
df_all$facet_label <- factor(df_all$facet_label,
levels = c(unique(df_up$facet_label), unique(df_down$facet_label)))
# Plot: 4 per row
ggplot(df_all, aes(x = dexamethasone, y = expression, fill = dexamethasone)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
geom_jitter(width = 0.2, alpha = 0.5) +
facet_wrap(~ facet_label, scales = "free_y", ncol = 4) +
scale_fill_manual(values = c("untreated" = "skyblue", "treated" = "salmon")) +
labs(title = "Expression of Top 10 Up- and Down-Regulated Genes",
x = "Condition", y = "Normalized Expression (VST)") +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 30, hjust = 1),
strip.text = element_text(face = "bold"))
Comment results here: These graphs show us what samples had up regulation and down regulation for the treated and untreated samples .
resDF <- as.data.frame(res) #already done for MA plot
resDF$gene <- rownames(res) #already done for MA plot
volcanop <- ggplot(resDF, aes(x=log2FoldChange, y=-log10(pvalue),
text = paste("ENSG: ", gene,
"<br>log2FC:", round(log2FoldChange, 2),
"<br>p-adj:", signif(padj, 3)))) +
geom_point(aes(color = padj < 0.05), alpha=0.6) +
scale_color_manual(values = c("grey", "red")) +
theme_minimal() +
xlim(c(-6,6)) +
labs(title="Volcano Plot", x="Log2 Fold Change", y="-log10(p-value)")
volcanop
# Convert to interactive plot
#ggplotly(volcanop, tooltip = "text")
Comment results here: This is a volcano plot and the gray areas are not significant but the dots in red are significant. Anything to the right of 0 is up regulated and anything to the left of 0 is down regulated .
vsdata <- vst(dds, blind=FALSE)
pcaData <- plotPCA(vsdata, intgroup="dexamethasone", returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=dexamethasone)) +
geom_point(size=4, alpha=0.8) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
theme_minimal() +
ggtitle("PCA of Samples")
# Create Interactive PCA
p <- ggplot(pcaData, aes(x = PC1, y = PC2, color = dexamethasone,
text = paste("Sample:", name,
"<br>Condition:", dexamethasone))) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
theme_minimal()
#interactive
# ggplotly(p, tooltip = "text")
Comment results here: We are looking here to see if the untreated samples are clustered together and the treated samples are also clustered – but we want to make sure that the treated and untreated are not clustered together - in this graph, we see that each treatment is clustered together which tells us that there is possibly something biological that should be investigated .
# Add a column to mark direction
topUp_df$direction <- "Upregulated"
topDown_df$direction <- "Downregulated"
# Combine into one dataframe
topGenes_df <- rbind(topUp_df, topDown_df)
# Get top 20 genes by padj
topGenes_df <- head(rownames(topGenes_df), 20)
pheatmap(assay(vsdata)[topGenes_df, ],
cluster_rows = TRUE,
cluster_cols = TRUE,
show_rownames = TRUE,
annotation_col = colData,
scale = "row",
fontsize_row = 8,
main = "Heatmap of Top 10 Up/Down DEGs")
Comment results here: This heatmap shows the top 10 samples for up regulation and the top 10 samples for down regulation .
# Get top 20 genes by padj
topGenes <- head(rownames(resOrdered), 20)
pheatmap(assay(vsdata)[topGenes, ],
cluster_rows = TRUE,
cluster_cols = TRUE,
show_rownames = TRUE,
annotation_col = colData,
scale = "row",
fontsize_row = 8,
main = "Heatmap of Top 20 DEGs (adj-pv)")
Comment results here: This heatmap shows the top 20 samples for either up regulation or down regulation. We see that for the untreated, the top four are up regulated while it is the opposite for the treated group. .
# Use biomaRt for ID conversion
mart <- useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")
# Convert IDs for top 10 up & top 10 down
geneIDs <- c(rownames(topUp_df), rownames(topDown_df))
annot <- getBM(attributes=c("ensembl_gene_id",
"hgnc_symbol",
"uniprot_gn_id",
"refseq_mrna"),
filters="ensembl_gene_id",
values=geneIDs,
mart=mart)
# Merge annotation with results
topUp_annot <- merge(topUp_df, annot, by.x="row.names", by.y="ensembl_gene_id")
topDown_annot <- merge(topDown_df, annot, by.x="row.names", by.y="ensembl_gene_id")
# Final annotated dataframes
#topUp_annot
#topDown_annot
# Merge annotation into res dataframe
resDF <- as.data.frame(res)
resDF$ensembl_gene_id <- rownames(resDF)
resDF_annot <- merge(resDF, annot, by="ensembl_gene_id", all.x=TRUE)
# Add a label column for top genes
highlightGenes <- c(rownames(topUp_df), rownames(topDown_df))
resDF_annot$label <- ifelse(resDF_annot$ensembl_gene_id %in% highlightGenes,
resDF_annot$hgnc_symbol, NA)
# ggplot volcano
vPlot <- ggplot(resDF_annot,
aes(x=log2FoldChange, y=-log10(pvalue),
text=paste0("ENSG: ", ensembl_gene_id,
"<br>Gene: ", hgnc_symbol,
"<br>UniProt: ", uniprot_gn_id,
"<br>Log2FC: ", round(log2FoldChange,2),
"<br>padj: ", signif(padj,3)))) +
geom_jitter(aes(color = padj < 0.05), alpha=0.6) +
scale_color_manual(values=c("grey","red")) +
theme_minimal() +
xlim(c(-6,6)) +
labs(title="Interactive Volcano Plot",
x="Log2 Fold Change", y="-log10(p-value)") +
geom_text(aes(label=label), vjust=-1, size=3, na.rm=TRUE)
vPlot
# Convert to interactive with Plotly
# ggplotly(vPlot, tooltip="text")
Comment results here: This is an interactive volcano plot so you can identify the exact data for each dot by hovering over it. Again we want to focus on the red dots which are significant. Anything to the right is up regulated and anything to the left is down regulated - KCTD12 is down regulated and STOM is up regulated .
# Get significant DEGs (padj < 0.05)
sigGenes <- rownames(res)[which(res$padj < 0.05 & !is.na(res$padj))]
# Remove version numbers
# Convert Ensembl IDs to Entrez IDs
sigGenes <- gsub("\\..*", "",
rownames(res)[which(res$padj < 0.05 & !is.na(res$padj))])
entrez <- bitr(sigGenes,
fromType = "ENSEMBL",
toType = "ENTREZID",
OrgDb = org.Hs.eg.db)
head(entrez)
## ENSEMBL ENTREZID
## 1 ENSG00000002834 3927
## 2 ENSG00000003096 90293
## 3 ENSG00000003402 8837
## 4 ENSG00000003987 9108
## 5 ENSG00000004059 381
## 6 ENSG00000004487 23028
Comment results here: This converts identification from ENSEMBL to ENTREZID .
#Take the significant DE genes and see what
#biological processes / pathways they’re enriched in.
ego <- enrichGO(gene=entrez$ENTREZID,
OrgDb=org.Hs.eg.db,
keyType="ENTREZID",
ont="BP", # Biological Process
pAdjustMethod="BH",
pvalueCutoff=0.05,
readable=TRUE)
# Dotplot
dotplot(ego, showCategory=20) + ggtitle("GO Biological Process Enrichment")
Comment results here: This shows us the ratio of the genes to their biological processes. The processes at the top have a higher amount of the genes participating in that process. Biological process describes pathways and larger biological objectives that genes contribute to - we see that this gene has a high ratio for actin filament organization
ego_mf <- enrichGO(gene = entrez$ENTREZID,
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
dotplot(ego_mf, showCategory=20) + ggtitle("GO: Molecular Function")
Comment results here: This shows the gene ratio compared to the molecular function - what is the function of the gene and how many of them are involved in the processes - molecular function defines the biochemical activities gene produces such as enzyme activity - we see this gene functions with DNA-binding transcription .
ego_cc <- enrichGO(gene = entrez$ENTREZID,
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
dotplot(ego_cc, showCategory=20) + ggtitle("GO: Cellular Component")
Comment results here: This shows the gene ratio compared to the cellular component involved. It shows where in the cell the gene is active and how many of them are involved with that part of the cell - cellular component specifies where gene products are active within the cell - where find the products of this gene are active at cell-substrate juntions .
#Check whether DE genes cluster in known metabolic or signaling pathways.
ekegg <- enrichKEGG(gene = entrez$ENTREZID,
organism = 'hsa',
pvalueCutoff = 0.05)
dotplot(ekegg, showCategory=15) + ggtitle("KEGG Pathway Enrichment")
Comment results here: This shows the links between genomic information and higher-order functional pathways - this helps to identify which pathways are enriched or disrupted - we see an enrichment PI3K-Akt signaling pathway is enriched
#Instead of just significant genes,
#rank all genes by log2FC and see if gene sets are enriched.
# Rank genes by log2FC
geneList <- res$log2FoldChange
names(geneList) <- gsub("\\..*", "", rownames(res))
geneList <- sort(geneList, decreasing=TRUE)
# Run GSEA
gseaRes <- gseGO(geneList=geneList,
OrgDb=org.Hs.eg.db,
ont="BP",
keyType="ENSEMBL",
pvalueCutoff=0.05)
ridgeplot(gseaRes) + ggtitle("GSEA of Biological Processes")
Comment results here: GSEA tests enrichment of a predefined gene set across conditions - this show biological pathways or functions that are consistently up regulated or down regulated - we see that temperature stimulus is consistently down regulated
# Perform Reactome pathway enrichment
ereactome <- enrichPathway(gene = entrez$ENTREZID,
organism = "human",
pvalueCutoff = 0.05,
readable = TRUE)
# View top enriched Reactome pathways
#head(as.data.frame(ereactome))
# Dotplot of top enriched Reactome pathways
dotplot(ereactome, showCategory = 20) +
ggtitle("Reactome Pathway Enrichment")
# Barplot alternative
barplot(ereactome, showCategory = 20, title = "Top Reactome Pathways")
# Network visualization (if desired)
cnetplot(ereactome, showCategory = 10, foldChange = NULL) +
ggtitle("Reactome Pathway - Gene Network")
# Save results to CSV for downstream analysis
write.csv(as.data.frame(ereactome), "Reactome_enrichment_results.csv", row.names = FALSE)
Comment results here: We see the top reactome pathway is the RHO GTPase pathway
#If you have multiple contrasts, you can show overlap of significant DEGs.
# Example if you had multiple sets of DEGs: lik top_up, top_down,
#show that there s no intersection between them
listInput <- list(
Up = rownames(topUp_df),
Down = rownames(topDown_df)
)
upset(fromList(listInput), order.by = "freq")
Comment results here: .
#First extract the Gene symols and the save as a csv
# Connect to Ensembl
mart <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
# Function to map Ensembl IDs to gene symbols
ensembl_to_symbol <- function(ensembl_ids) {
mapping <- getBM(
attributes = c("ensembl_gene_id", "hgnc_symbol"),
filters = "ensembl_gene_id",
values = ensembl_ids,
mart = mart
)
return(mapping)
}
#Add add column name "Ensembl_ID" to the first
#column of topUp_df and first column of topDown_df
# Move rownames into a real column called Ensembl_ID
topUp_df$Ensembl_ID <- rownames(topUp_df)
rownames(topUp_df) <- NULL
topDown_df$Ensembl_ID <- rownames(topDown_df)
rownames(topDown_df) <- NULL
#Now both will have Ensembl_ID as the first column (you can reorder if you want):
topUp_df <- topUp_df[, c("Ensembl_ID", setdiff(names(topUp_df), "Ensembl_ID"))]
topDown_df <- topDown_df[, c("Ensembl_ID", setdiff(names(topDown_df), "Ensembl_ID"))]
# Extract Ensembl IDs from your dataframes
up_ids <- topUp_df$Ensembl_ID
down_ids <- topDown_df$Ensembl_ID
# Map IDs
up_map <- ensembl_to_symbol(up_ids)
down_map <- ensembl_to_symbol(down_ids)
# Add regulation status
up_map$Status <- "Upregulated"
down_map$Status <- "Downregulated"
# Keep only 2 columns (GeneSymbol, Status)
up_map <- up_map[, c("hgnc_symbol", "Status")]
down_map <- down_map[, c("hgnc_symbol", "Status")]
# Combine
combined_df <- rbind(up_map, down_map)
# Rename columns
colnames(combined_df) <- c("GeneSymbol", "Status")
# Save to CSV
write.csv(combined_df, "combined_genes.csv", row.names = FALSE)
Comment results here: .
#First extract the Gene symbols and the save as a csv
# Order genes by adjusted p-value
top_50 <- res[order(res$padj), ]
top_50 <- top_50[1:50, ]
##
top_50$Ensembl_ID <- rownames(top_50)
rownames(top_50) <- NULL
# top50 will have Ensembl_ID as the first column (you can reorder if you want):
top_50 <- top_50[, c("Ensembl_ID", setdiff(names(top_50), "Ensembl_ID"))]
# Map Ensembl IDs to gene symbols
top_50$Gene_Symbol <- mapIds(
org.Hs.eg.db,
keys = top_50$Ensembl_ID,
column = "SYMBOL",
keytype = "ENSEMBL",
multiVals = "first"
)
top_50$Status <- ifelse(top_50$log2FoldChange > 0, "Upregulated", "Downregulated")
# Keep only Ensembl_ID, Gene_Symbol, and adjusted p-value
out <- top_50[, c("Ensembl_ID", "Gene_Symbol", "log2FoldChange", "padj", "Status")]
# Write to CSV
write.csv(out, "top_50_genes.csv", row.names = FALSE)