load packages

# Core RNA-seq analysis
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: generics
## 
## Attaching package: 'generics'
## The following objects are masked from 'package:base':
## 
##     as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
##     setequal, union
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
##     unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
# Data wrangling + plotting
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'stringr' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse()     masks IRanges::collapse()
## ✖ dplyr::combine()      masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count()        masks matrixStats::count()
## ✖ dplyr::desc()         masks IRanges::desc()
## ✖ tidyr::expand()       masks S4Vectors::expand()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ dplyr::first()        masks S4Vectors::first()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ ggplot2::Position()   masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()       masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()       masks S4Vectors::rename()
## ✖ lubridate::second()   masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice()        masks IRanges::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pheatmap)
library(ggplot2)

# Functional enrichment
library(clusterProfiler)
## 
## clusterProfiler v4.16.0 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
## 
## Please cite:
## 
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan,
## X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal
## enrichment tool for interpreting omics data. The Innovation. 2021,
## 2(3):100141
## 
## Attaching package: 'clusterProfiler'
## 
## The following object is masked from 'package:purrr':
## 
##     simplify
## 
## The following object is masked from 'package:IRanges':
## 
##     slice
## 
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## 
## The following object is masked from 'package:stats':
## 
##     filter
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## 
## Attaching package: 'AnnotationDbi'
## 
## The following object is masked from 'package:clusterProfiler':
## 
##     select
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(ReactomePA)
## ReactomePA v1.52.0 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
## 
## Please cite:
## 
## Guangchuang Yu, Qing-Yu He. ReactomePA: an R/Bioconductor package for
## reactome pathway analysis and visualization. Molecular BioSystems.
## 2016, 12(2):477-479

Read in counts and metadata

counts <- read.csv("C:/Users/anifo/Downloads/AML data/rawd_counts.csv", row.names = 1, check.names = FALSE)
meta   <- read.csv("C:/Users/anifo/Downloads/AML data/metadatadisease - metadatadisease.csv")

meta$DiseaseState <- factor(meta$Characteristics.DiseaseState.)

# Align metadata order with counts
meta <- meta[match(colnames(counts), meta$Extract.Name), ]

# Create the DESeq2 object
dds <- DESeqDataSetFromMatrix(
                countData = counts,
                colData   = meta,
                design    = ~ DiseaseState
)

Run DESeq2 and plot PCA

dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 8063 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
res <- results(dds)

# First QC: variance-stabilized data + PCA
vsd <- vst(dds, blind = TRUE)
plotPCA(vsd, intgroup = "DiseaseState")
## using ntop=500 top features by variance
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the DESeq2 package.
##   Please report the issue to the authors.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

sample-sample distance heatmap (QC)

sampleDists <- dist(t(assay(vsd))) 
sampleDistMatrix <- as.matrix(sampleDists) 
rownames(sampleDistMatrix) <- vsd$DiseaseState 
colnames(sampleDistMatrix) <- vsd$DiseaseState 

pheatmap(sampleDistMatrix, 
         clustering_distance_rows = sampleDists, 
         clustering_distance_cols = sampleDists, 
         main = "Sample-to-sample distances")

Differential Expression results

resOrdered <- res[order(res$padj), ]
summary(res)
## 
## out of 58159 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 2710, 4.7%
## LFC < 0 (down)     : 18042, 31%
## outliers [1]       : 0, 0%
## low counts [2]     : 13500, 23%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_df <- as.data.frame(res)
res_df$gene <- rownames(res_df)

#To filter significant genes
sig <- res_df[!is.na(res_df$padj) & res_df$padj < 0.05, ]
sig <- sig[abs(sig$log2FoldChange) > 1, ]

# Extract the upregulated genes
up_genes <- sig[sig$log2FoldChange > 1, ]
up_list <- up_genes$gene

#Extract the downregulated genes
down_genes <- sig[sig$log2FoldChange < -1, ]
down_list <- down_genes$gene

length(up_list)
## [1] 410
length(down_list)
## [1] 8890

Visualize DE results

res_df <- as.data.frame(res)
res_df$gene <- rownames(res_df)

# Categorize genes
res_df$category <- "Not Sig"
res_df$category[res_df$padj < 0.05 & res_df$log2FoldChange > 1]  <- "Up"
res_df$category[res_df$padj < 0.05 & res_df$log2FoldChange < -1] <- "Down"

Volcano plot

ggplot(res_df, aes(x = log2FoldChange, y = -log10(padj), colour = category)) +
                geom_point(alpha = 0.7, size = 1.5) +
                scale_color_manual(values = c(
                                "Up" = "red",
                                "Down" = "blue",
                                "Not Sig" = "grey70"
                )) +
                theme_minimal() +
                labs(
                                title = "Volcano Plot",
                                x = "log2 Fold Change",
                                y = "-log10(FDR)"
                )
## Warning: Removed 16001 rows containing missing values or values outside the scale range
## (`geom_point()`).

MA plot

plotMA(res, ylim = c(-5, 5))

Prepare gene list for enrichment

geneList <- res$log2FoldChange
names(geneList) <- rownames(res)
geneList <- sort(geneList, decreasing = TRUE)

# To filter the gene IDs and convert to entrez ID
up_clean <- sub("\\..*", "", up_list)
down_clean <- sub("\\..*", "", down_list)

up_entrez <- bitr(up_clean,
                  fromType = "ENSEMBL",
                  toType = "ENTREZID",
                  OrgDb = org.Hs.eg.db)$ENTREZID
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(up_clean, fromType = "ENSEMBL", toType = "ENTREZID", OrgDb =
## org.Hs.eg.db): 20.73% of input gene IDs are fail to map...
down_entrez <- bitr(down_clean,
                    fromType = "ENSEMBL",
                    toType = "ENTREZID",
                    OrgDb = org.Hs.eg.db)$ENTREZID
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(down_clean, fromType = "ENSEMBL", toType = "ENTREZID", OrgDb =
## org.Hs.eg.db): 45.21% of input gene IDs are fail to map...

GO enrichment analysis for upregulated genes

ego_up <- enrichGO(
                gene          = up_clean,
                OrgDb         = org.Hs.eg.db,
                keyType       = "ENSEMBL",   # or "SYMBOL"
                ont           = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.05,
                qvalueCutoff  = 0.05
)

GO enrichment analysis for downreulated genes

ego_down <- enrichGO(
                gene          = down_clean,
                OrgDb         = org.Hs.eg.db,
                keyType       = "ENSEMBL",
                ont           = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.05,
                qvalueCutoff  = 0.05
)

Visualize

dotplot(ego_up, showCategory = 20)

dotplot(ego_down, showCategory = 20)

Running the KEGG analysis

kegg_up <- enrichKEGG(
                gene         = up_entrez,
                organism     = "hsa",
                pvalueCutoff = 0.05
)
## Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...
## Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...
kegg_down <- enrichKEGG(
                gene         = down_entrez,
                organism     = "hsa",
                pvalueCutoff = 0.05
)

Plot the KEGG result

dotplot(kegg_up)

dotplot(kegg_down)

Reactome enrichment pathways

react_up <- enrichPathway(
                gene         = up_entrez,
                organism     = "human",
                pvalueCutoff = 0.05
)

react_down <- enrichPathway(
                gene         = down_entrez,
                organism     = "human",
                pvalueCutoff = 0.05
)

plot the Reactome enrichment pathways result

dotplot(react_up)

dotplot(react_down)