# Core RNA-seq analysis
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: generics
##
## Attaching package: 'generics'
## The following objects are masked from 'package:base':
##
## as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
## setequal, union
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
## unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
# Data wrangling + plotting
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'stringr' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse() masks IRanges::collapse()
## ✖ dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count() masks matrixStats::count()
## ✖ dplyr::desc() masks IRanges::desc()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks S4Vectors::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename() masks S4Vectors::rename()
## ✖ lubridate::second() masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice() masks IRanges::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pheatmap)
library(ggplot2)
# Functional enrichment
library(clusterProfiler)
##
## clusterProfiler v4.16.0 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
##
## Please cite:
##
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan,
## X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal
## enrichment tool for interpreting omics data. The Innovation. 2021,
## 2(3):100141
##
## Attaching package: 'clusterProfiler'
##
## The following object is masked from 'package:purrr':
##
## simplify
##
## The following object is masked from 'package:IRanges':
##
## slice
##
## The following object is masked from 'package:S4Vectors':
##
## rename
##
## The following object is masked from 'package:stats':
##
## filter
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
##
## Attaching package: 'AnnotationDbi'
##
## The following object is masked from 'package:clusterProfiler':
##
## select
##
## The following object is masked from 'package:dplyr':
##
## select
library(ReactomePA)
## ReactomePA v1.52.0 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
##
## Please cite:
##
## Guangchuang Yu, Qing-Yu He. ReactomePA: an R/Bioconductor package for
## reactome pathway analysis and visualization. Molecular BioSystems.
## 2016, 12(2):477-479
counts <- read.csv("C:/Users/anifo/Downloads/AML data/rawd_counts.csv", row.names = 1, check.names = FALSE)
meta <- read.csv("C:/Users/anifo/Downloads/AML data/metadatadisease - metadatadisease.csv")
meta$DiseaseState <- factor(meta$Characteristics.DiseaseState.)
# Align metadata order with counts
meta <- meta[match(colnames(counts), meta$Extract.Name), ]
# Create the DESeq2 object
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = meta,
design = ~ DiseaseState
)
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 8063 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
res <- results(dds)
# First QC: variance-stabilized data + PCA
vsd <- vst(dds, blind = TRUE)
plotPCA(vsd, intgroup = "DiseaseState")
## using ntop=500 top features by variance
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the DESeq2 package.
## Please report the issue to the authors.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- vsd$DiseaseState
colnames(sampleDistMatrix) <- vsd$DiseaseState
pheatmap(sampleDistMatrix,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
main = "Sample-to-sample distances")
resOrdered <- res[order(res$padj), ]
summary(res)
##
## out of 58159 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 2710, 4.7%
## LFC < 0 (down) : 18042, 31%
## outliers [1] : 0, 0%
## low counts [2] : 13500, 23%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_df <- as.data.frame(res)
res_df$gene <- rownames(res_df)
#To filter significant genes
sig <- res_df[!is.na(res_df$padj) & res_df$padj < 0.05, ]
sig <- sig[abs(sig$log2FoldChange) > 1, ]
# Extract the upregulated genes
up_genes <- sig[sig$log2FoldChange > 1, ]
up_list <- up_genes$gene
#Extract the downregulated genes
down_genes <- sig[sig$log2FoldChange < -1, ]
down_list <- down_genes$gene
length(up_list)
## [1] 410
length(down_list)
## [1] 8890
res_df <- as.data.frame(res)
res_df$gene <- rownames(res_df)
# Categorize genes
res_df$category <- "Not Sig"
res_df$category[res_df$padj < 0.05 & res_df$log2FoldChange > 1] <- "Up"
res_df$category[res_df$padj < 0.05 & res_df$log2FoldChange < -1] <- "Down"
ggplot(res_df, aes(x = log2FoldChange, y = -log10(padj), colour = category)) +
geom_point(alpha = 0.7, size = 1.5) +
scale_color_manual(values = c(
"Up" = "red",
"Down" = "blue",
"Not Sig" = "grey70"
)) +
theme_minimal() +
labs(
title = "Volcano Plot",
x = "log2 Fold Change",
y = "-log10(FDR)"
)
## Warning: Removed 16001 rows containing missing values or values outside the scale range
## (`geom_point()`).
plotMA(res, ylim = c(-5, 5))
geneList <- res$log2FoldChange
names(geneList) <- rownames(res)
geneList <- sort(geneList, decreasing = TRUE)
# To filter the gene IDs and convert to entrez ID
up_clean <- sub("\\..*", "", up_list)
down_clean <- sub("\\..*", "", down_list)
up_entrez <- bitr(up_clean,
fromType = "ENSEMBL",
toType = "ENTREZID",
OrgDb = org.Hs.eg.db)$ENTREZID
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(up_clean, fromType = "ENSEMBL", toType = "ENTREZID", OrgDb =
## org.Hs.eg.db): 20.73% of input gene IDs are fail to map...
down_entrez <- bitr(down_clean,
fromType = "ENSEMBL",
toType = "ENTREZID",
OrgDb = org.Hs.eg.db)$ENTREZID
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(down_clean, fromType = "ENSEMBL", toType = "ENTREZID", OrgDb =
## org.Hs.eg.db): 45.21% of input gene IDs are fail to map...
ego_up <- enrichGO(
gene = up_clean,
OrgDb = org.Hs.eg.db,
keyType = "ENSEMBL", # or "SYMBOL"
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05
)
ego_down <- enrichGO(
gene = down_clean,
OrgDb = org.Hs.eg.db,
keyType = "ENSEMBL",
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05
)
dotplot(ego_up, showCategory = 20)
dotplot(ego_down, showCategory = 20)
kegg_up <- enrichKEGG(
gene = up_entrez,
organism = "hsa",
pvalueCutoff = 0.05
)
## Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...
## Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...
kegg_down <- enrichKEGG(
gene = down_entrez,
organism = "hsa",
pvalueCutoff = 0.05
)
dotplot(kegg_up)
dotplot(kegg_down)
react_up <- enrichPathway(
gene = up_entrez,
organism = "human",
pvalueCutoff = 0.05
)
react_down <- enrichPathway(
gene = down_entrez,
organism = "human",
pvalueCutoff = 0.05
)
dotplot(react_up)
dotplot(react_down)