load packages

# Core RNA-seq analysis
library(DESeq2)

## Loading required package: S4Vectors

## Loading required package: stats4

## Loading required package: BiocGenerics

## Loading required package: generics

## 
## Attaching package: 'generics'

## The following objects are masked from 'package:base':
## 
##     as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
##     setequal, union

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
##     unsplit, which.max, which.min

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:utils':
## 
##     findMatches

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:grDevices':
## 
##     windows

## Loading required package: GenomicRanges

## Loading required package: GenomeInfoDb

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

# Data wrangling + plotting
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.5.2

## Warning: package 'ggplot2' was built under R version 4.5.2

## Warning: package 'tidyr' was built under R version 4.5.2

## Warning: package 'readr' was built under R version 4.5.2

## Warning: package 'purrr' was built under R version 4.5.2

## Warning: package 'stringr' was built under R version 4.5.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse()     masks IRanges::collapse()
## ✖ dplyr::combine()      masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count()        masks matrixStats::count()
## ✖ dplyr::desc()         masks IRanges::desc()
## ✖ tidyr::expand()       masks S4Vectors::expand()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ dplyr::first()        masks S4Vectors::first()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ ggplot2::Position()   masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()       masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()       masks S4Vectors::rename()
## ✖ lubridate::second()   masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice()        masks IRanges::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(pheatmap)
library(ggplot2)

# Functional enrichment
library(clusterProfiler)

## 
## clusterProfiler v4.16.0 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
## 
## Please cite:
## 
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan,
## X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal
## enrichment tool for interpreting omics data. The Innovation. 2021,
## 2(3):100141
## 
## Attaching package: 'clusterProfiler'
## 
## The following object is masked from 'package:purrr':
## 
##     simplify
## 
## The following object is masked from 'package:IRanges':
## 
##     slice
## 
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## 
## The following object is masked from 'package:stats':
## 
##     filter

library(org.Hs.eg.db)

## Loading required package: AnnotationDbi
## 
## Attaching package: 'AnnotationDbi'
## 
## The following object is masked from 'package:clusterProfiler':
## 
##     select
## 
## The following object is masked from 'package:dplyr':
## 
##     select

library(ReactomePA)

## ReactomePA v1.52.0 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
## 
## Please cite:
## 
## Guangchuang Yu, Qing-Yu He. ReactomePA: an R/Bioconductor package for
## reactome pathway analysis and visualization. Molecular BioSystems.
## 2016, 12(2):477-479

Read in counts and metadata

counts <- read.csv("C:/Users/anifo/Downloads/AML data/rawd_counts.csv", row.names = 1, check.names = FALSE)
meta   <- read.csv("C:/Users/anifo/Downloads/AML data/metadatadisease - metadatadisease.csv")

meta$DiseaseState <- factor(meta$Characteristics.DiseaseState.)

# Align metadata order with counts
meta <- meta[match(colnames(counts), meta$Extract.Name), ]

# Create the DESeq2 object
dds <- DESeqDataSetFromMatrix(
                countData = counts,
                colData   = meta,
                design    = ~ DiseaseState
)

Run DESeq2 and plot PCA

dds <- DESeq(dds)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

## -- replacing outliers and refitting for 8063 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)

## estimating dispersions

## fitting model and testing

res <- results(dds)

# First QC: variance-stabilized data + PCA
vsd <- vst(dds, blind = TRUE)
plotPCA(vsd, intgroup = "DiseaseState")

## using ntop=500 top features by variance

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the DESeq2 package.
##   Please report the issue to the authors.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

sample-sample distance heatmap (QC)

sampleDists <- dist(t(assay(vsd))) 
sampleDistMatrix <- as.matrix(sampleDists) 
rownames(sampleDistMatrix) <- vsd$DiseaseState 
colnames(sampleDistMatrix) <- vsd$DiseaseState 

pheatmap(sampleDistMatrix, 
         clustering_distance_rows = sampleDists, 
         clustering_distance_cols = sampleDists, 
         main = "Sample-to-sample distances")

Differential Expression results

resOrdered <- res[order(res$padj), ]
summary(res)

## 
## out of 58159 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 2710, 4.7%
## LFC < 0 (down)     : 18042, 31%
## outliers [1]       : 0, 0%
## low counts [2]     : 13500, 23%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

res_df <- as.data.frame(res)
res_df$gene <- rownames(res_df)

#To filter significant genes
sig <- res_df[!is.na(res_df$padj) & res_df$padj < 0.05, ]
sig <- sig[abs(sig$log2FoldChange) > 1, ]

# Extract the upregulated genes
up_genes <- sig[sig$log2FoldChange > 1, ]
up_list <- up_genes$gene

#Extract the downregulated genes
down_genes <- sig[sig$log2FoldChange < -1, ]
down_list <- down_genes$gene

length(up_list)

## [1] 410

length(down_list)

## [1] 8890

Visualize DE results

res_df <- as.data.frame(res)
res_df$gene <- rownames(res_df)

# Categorize genes
res_df$category <- "Not Sig"
res_df$category[res_df$padj < 0.05 & res_df$log2FoldChange > 1]  <- "Up"
res_df$category[res_df$padj < 0.05 & res_df$log2FoldChange < -1] <- "Down"

Volcano plot

ggplot(res_df, aes(x = log2FoldChange, y = -log10(padj), colour = category)) +
                geom_point(alpha = 0.7, size = 1.5) +
                scale_color_manual(values = c(
                                "Up" = "red",
                                "Down" = "blue",
                                "Not Sig" = "grey70"
                )) +
                theme_minimal() +
                labs(
                                title = "Volcano Plot",
                                x = "log2 Fold Change",
                                y = "-log10(FDR)"
                )

## Warning: Removed 16001 rows containing missing values or values outside the scale range
## (`geom_point()`).

MA plot

plotMA(res, ylim = c(-5, 5))

Prepare gene list for enrichment

geneList <- res$log2FoldChange
names(geneList) <- rownames(res)
geneList <- sort(geneList, decreasing = TRUE)

# To filter the gene IDs and convert to entrez ID
up_clean <- sub("\\..*", "", up_list)
down_clean <- sub("\\..*", "", down_list)

up_entrez <- bitr(up_clean,
                  fromType = "ENSEMBL",
                  toType = "ENTREZID",
                  OrgDb = org.Hs.eg.db)$ENTREZID

## 'select()' returned 1:many mapping between keys and columns

## Warning in bitr(up_clean, fromType = "ENSEMBL", toType = "ENTREZID", OrgDb =
## org.Hs.eg.db): 20.73% of input gene IDs are fail to map...

down_entrez <- bitr(down_clean,
                    fromType = "ENSEMBL",
                    toType = "ENTREZID",
                    OrgDb = org.Hs.eg.db)$ENTREZID

## 'select()' returned 1:many mapping between keys and columns

## Warning in bitr(down_clean, fromType = "ENSEMBL", toType = "ENTREZID", OrgDb =
## org.Hs.eg.db): 45.21% of input gene IDs are fail to map...

GO enrichment analysis for upregulated genes

ego_up <- enrichGO(
                gene          = up_clean,
                OrgDb         = org.Hs.eg.db,
                keyType       = "ENSEMBL",   # or "SYMBOL"
                ont           = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.05,
                qvalueCutoff  = 0.05
)

GO enrichment analysis for downreulated genes

ego_down <- enrichGO(
                gene          = down_clean,
                OrgDb         = org.Hs.eg.db,
                keyType       = "ENSEMBL",
                ont           = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.05,
                qvalueCutoff  = 0.05
)

Visualize

dotplot(ego_up, showCategory = 20)

dotplot(ego_down, showCategory = 20)

Running the KEGG analysis

kegg_up <- enrichKEGG(
                gene         = up_entrez,
                organism     = "hsa",
                pvalueCutoff = 0.05
)

## Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...

## Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...

kegg_down <- enrichKEGG(
                gene         = down_entrez,
                organism     = "hsa",
                pvalueCutoff = 0.05
)

Plot the KEGG result

dotplot(kegg_up)

dotplot(kegg_down)

Reactome enrichment pathways

react_up <- enrichPathway(
                gene         = up_entrez,
                organism     = "human",
                pvalueCutoff = 0.05
)

react_down <- enrichPathway(
                gene         = down_entrez,
                organism     = "human",
                pvalueCutoff = 0.05
)

plot the Reactome enrichment pathways result

dotplot(react_up)

dotplot(react_down)

AML transcriptomics analysis

load packages

Read in counts and metadata

Run DESeq2 and plot PCA

sample-sample distance heatmap (QC)

Differential Expression results

Visualize DE results

Volcano plot

MA plot

Prepare gene list for enrichment

GO enrichment analysis for upregulated genes

GO enrichment analysis for downreulated genes

Visualize

Running the KEGG analysis

Plot the KEGG result

Reactome enrichment pathways

plot the Reactome enrichment pathways result