This code has been borrowed from http://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html for demonstration purposes.
Installing required packages:
source("https://bioconductor.org/biocLite.R")
biocLite("DESeq2")
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install(c("DESeq2", "pasilla", "apeglm", "EnhancedVolcano"))
installed.packages('readr')
install.packages("pheatmap")
Alternatively, you can install some packages as follows:
BiocManager::install("edgeR")
Loading the packages:
library("DESeq2")
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library("pasilla")
library("readr")
This data set is from an experiment on Drosophila melanogaster cell cultures and investigated the effect of RNAi knock-down of the splicing factor pasilla (Brooks et al. 2011)
library("pasilla")
pasCts <- system.file("extdata",
"pasilla_gene_counts.tsv",
package="pasilla", mustWork=TRUE)
pasAnno <- system.file("extdata",
"pasilla_sample_annotation.csv",
package="pasilla", mustWork=TRUE)
cts <- as.matrix(read.csv(pasCts, sep="\t", row.names="gene_id"))
head(cts)
## untreated1 untreated2 untreated3 untreated4 treated1 treated2
## FBgn0000003 0 0 0 0 0 0
## FBgn0000008 92 161 76 70 140 88
## FBgn0000014 5 1 0 0 4 0
## FBgn0000015 0 2 1 2 1 0
## FBgn0000017 4664 8714 3564 3150 6205 3072
## FBgn0000018 583 761 245 310 722 299
## treated3
## FBgn0000003 1
## FBgn0000008 70
## FBgn0000014 0
## FBgn0000015 0
## FBgn0000017 3334
## FBgn0000018 308
coldata <- read.csv(pasAnno, row.names=1)
coldata
coldata <- coldata[,c("condition","type")]
coldata$condition <- factor(coldata$condition)
coldata$type <- factor(coldata$type)
rownames(coldata) <- sub("fb", "", rownames(coldata))
all(rownames(coldata) %in% colnames(cts))
## [1] TRUE
all(rownames(coldata) == colnames(cts))
## [1] FALSE
cts <- cts[, rownames(coldata)]
all(rownames(coldata) == colnames(cts))
## [1] TRUE
dds <- DESeqDataSetFromMatrix(countData = cts,
colData = coldata,
design = ~ condition)
dds
## class: DESeqDataSet
## dim: 14599 7
## metadata(1): version
## assays(1): counts
## rownames(14599): FBgn0000003 FBgn0000008 ... FBgn0261574 FBgn0261575
## rowData names(0):
## colnames(7): treated1 treated2 ... untreated3 untreated4
## colData names(2): condition type
(A minimal) Gene pre-filtering Note: More strict filtering to increase power is automatically applied when using DESeq2.
keep <- rowSums(counts(dds)) >= 10 #Keep genes that has more than 10 counts in total across all samples
dds <- dds[keep, ]
Define the comparison conditions
dds$condition <- factor(dds$condition, levels = c("untreated","treated"))
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds)
res
## log2 fold change (MLE): condition treated vs untreated
## Wald test p-value: condition treated vs untreated
## DataFrame with 9921 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## FBgn0000008 95.14429 0.00227644 0.223729 0.010175 0.9918817 0.997211
## FBgn0000014 1.05652 -0.49512039 2.143186 -0.231021 0.8172987 NA
## FBgn0000017 4352.55357 -0.23991894 0.126337 -1.899041 0.0575591 0.288002
## FBgn0000018 418.61048 -0.10467391 0.148489 -0.704927 0.4808558 0.826834
## FBgn0000024 6.40620 0.21084779 0.689588 0.305759 0.7597879 0.943501
## ... ... ... ... ... ... ...
## FBgn0261570 3208.38861 0.2955329 0.127350 2.3206264 0.020307 0.144240
## FBgn0261572 6.19719 -0.9588230 0.775315 -1.2366888 0.216203 0.607848
## FBgn0261573 2240.97951 0.0127194 0.113300 0.1122634 0.910615 0.982657
## FBgn0261574 4857.68037 0.0153924 0.192567 0.0799327 0.936291 0.988179
## FBgn0261575 10.68252 0.1635705 0.930911 0.1757102 0.860522 0.967928
Order the results table based on low p-value
resOrdered <- res[order(res$pvalue),]
resOrdered
## log2 fold change (MLE): condition treated vs untreated
## Wald test p-value: condition treated vs untreated
## DataFrame with 9921 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## FBgn0039155 730.568 -4.61874 0.1691240 -27.3098 3.24447e-164
## FBgn0025111 1501.448 2.89995 0.1273576 22.7701 9.07164e-115
## FBgn0029167 3706.024 -2.19691 0.0979154 -22.4368 1.72030e-111
## FBgn0003360 4342.832 -3.17954 0.1435677 -22.1466 1.12417e-108
## FBgn0035085 638.219 -2.56024 0.1378126 -18.5777 4.86845e-77
## ... ... ... ... ... ...
## FBgn0051913 2.75973 -0.001450714 1.239864 -0.001170059 0.999066
## FBgn0037061 144.76833 0.000314936 0.290635 0.001083616 0.999135
## FBgn0036689 117.56024 -0.000228957 0.214822 -0.001065801 0.999150
## FBgn0046272 6.54031 -0.001980640 2.439802 -0.000811803 0.999352
## FBgn0030880 13.00577 -6.969572683 3.681447 -1.893161117 NA
## padj
## <numeric>
## FBgn0039155 2.71919e-160
## FBgn0025111 3.80147e-111
## FBgn0029167 4.80595e-108
## FBgn0003360 2.35542e-105
## FBgn0035085 8.16049e-74
## ... ...
## FBgn0051913 NA
## FBgn0037061 0.999269
## FBgn0036689 0.999269
## FBgn0046272 0.999352
## FBgn0030880 NA
Summary of results
summary(res)
##
## out of 9921 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 518, 5.2%
## LFC < 0 (down) : 536, 5.4%
## outliers [1] : 1, 0.01%
## low counts [2] : 1539, 16%
## (mean count < 6)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
sum(res$padj < 0.1, na.rm=TRUE)
## [1] 1054
Can change the alpha threshold
res05 <- results(dds, alpha=0.05)
summary(res05)
##
## out of 9921 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 407, 4.1%
## LFC < 0 (down) : 431, 4.3%
## outliers [1] : 1, 0.01%
## low counts [2] : 1347, 14%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
plotMA(res, ylim=c(-2,2))
MA plot with shrunken log2 FC values which removes noise from low expressed genes
resultsNames(dds)
## [1] "Intercept" "condition_treated_vs_untreated"
resLFC <- lfcShrink(dds, coef="condition_treated_vs_untreated", type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resLFC
## log2 fold change (MAP): condition treated vs untreated
## Wald test p-value: condition treated vs untreated
## DataFrame with 9921 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## FBgn0000008 95.14429 0.00119920 0.151897 0.9918817 0.997211
## FBgn0000014 1.05652 -0.00473412 0.205468 0.8172987 NA
## FBgn0000017 4352.55357 -0.18989990 0.120377 0.0575591 0.288002
## FBgn0000018 418.61048 -0.06995753 0.123901 0.4808558 0.826834
## FBgn0000024 6.40620 0.01752715 0.198633 0.7597879 0.943501
## ... ... ... ... ... ...
## FBgn0261570 3208.38861 0.24110290 0.1244469 0.020307 0.144240
## FBgn0261572 6.19719 -0.06576173 0.2141351 0.216203 0.607848
## FBgn0261573 2240.97951 0.01000619 0.0993764 0.910615 0.982657
## FBgn0261574 4857.68037 0.00843552 0.1408267 0.936291 0.988179
## FBgn0261575 10.68252 0.00809101 0.2014704 0.860522 0.967928
plotMA(resLFC, ylim=c(-2,2))
d <- plotCounts(dds, gene=which.min(res$padj),
intgroup="condition",
returnData=TRUE)
library("ggplot2")
ggplot(d, aes(x=condition, y=count, color=condition)) +
geom_point(position=position_jitter(w=0.1,h=0), size=4) +
scale_y_log10(breaks=c(25,100,400))
library(EnhancedVolcano)
## Loading required package: ggrepel
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
EnhancedVolcano(res,
labSize = 2,
lab = rownames(res),
x = 'log2FoldChange',
y = 'pvalue')
## Warning: Ignoring unknown parameters: xlim, ylim
Normalize the expression data for library size using cpm function from edgeR package
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.15.2
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
cpm_pas <- edgeR::cpm(cts)
pas_k2 <- kmeans(t(cpm_pas), centers = 2, iter.max = 100)
pas_k2$cluster
## treated1 treated2 treated3 untreated1 untreated2 untreated3 untreated4
## 1 1 1 2 2 2 2
Calculate the distances between each pair of samples using euclidean method
d_pas <- dist(t(cpm_pas), method = 'euclidean')
Create a hierarchical clustering based on the distances calculated above using hclust function from base R
hc_pas <- hclust(d_pas, method = "complete")
Create a dendogram
dend <- as.dendrogram(hc_pas)
dend <- color_branches(dend)
dend <- hang.dendrogram(dend)
plot(dend,
main = "Clustered Pasilla data set", nodePar = list(cex = .7))
Select the genes that significantly differentially expressed between treated and untreated samples
#Heatmap for DE genes
res <- results(dds)
DEGs <- res[which(res$padj < 0.05 & abs(res$log2FoldChange) > 1.0 ),]
Plot a heatmap showing the expressions of DEGs using pheatmap function
pheatmap::pheatmap(as.matrix(cpm_pas[rownames(DEGs),]), scale = "row")
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] dendextend_1.15.2 EnhancedVolcano_1.10.0
## [3] ggrepel_0.9.1 ggplot2_3.3.5
## [5] readr_1.4.0 pasilla_1.20.0
## [7] DESeq2_1.32.0 SummarizedExperiment_1.22.0
## [9] Biobase_2.52.0 MatrixGenerics_1.4.0
## [11] matrixStats_0.60.0 GenomicRanges_1.44.0
## [13] GenomeInfoDb_1.28.1 IRanges_2.26.0
## [15] S4Vectors_0.30.0 BiocGenerics_0.38.0
##
## loaded via a namespace (and not attached):
## [1] ggbeeswarm_0.6.0 colorspace_2.0-2 ellipsis_0.3.2
## [4] XVector_0.32.0 farver_2.1.0 bit64_4.0.5
## [7] AnnotationDbi_1.54.1 fansi_0.5.0 mvtnorm_1.1-2
## [10] apeglm_1.14.0 splines_4.1.0 extrafont_0.17
## [13] cachem_1.0.6 geneplotter_1.70.0 knitr_1.33
## [16] jsonlite_1.7.2 Rttf2pt1_1.3.9 annotate_1.70.0
## [19] png_0.1-7 pheatmap_1.0.12 compiler_4.1.0
## [22] httr_1.4.2 assertthat_0.2.1 Matrix_1.3-3
## [25] fastmap_1.1.0 limma_3.48.3 htmltools_0.5.2
## [28] tools_4.1.0 coda_0.19-4 gtable_0.3.0
## [31] glue_1.4.2 GenomeInfoDbData_1.2.6 dplyr_1.0.7
## [34] maps_3.4.0 Rcpp_1.0.7 bbmle_1.0.24
## [37] jquerylib_0.1.4 vctrs_0.3.8 Biostrings_2.60.1
## [40] ggalt_0.4.0 extrafontdb_1.0 xfun_0.24
## [43] stringr_1.4.0 lifecycle_1.0.0 XML_3.99-0.6
## [46] edgeR_3.34.1 zlibbioc_1.38.0 MASS_7.3-54
## [49] scales_1.1.1 hms_1.1.0 proj4_1.0-10.1
## [52] RColorBrewer_1.1-2 yaml_2.2.1 gridExtra_2.3
## [55] memoise_2.0.0 ggrastr_1.0.1 emdbook_1.3.12
## [58] sass_0.4.0 bdsmatrix_1.3-4 stringi_1.6.2
## [61] RSQLite_2.2.8 highr_0.9 genefilter_1.74.1
## [64] BiocParallel_1.26.1 rlang_0.4.11 pkgconfig_2.0.3
## [67] bitops_1.0-7 evaluate_0.14 lattice_0.20-44
## [70] purrr_0.3.4 labeling_0.4.2 bit_4.0.4
## [73] tidyselect_1.1.1 plyr_1.8.6 magrittr_2.0.1
## [76] R6_2.5.0 generics_0.1.0 DelayedArray_0.18.0
## [79] DBI_1.1.1 pillar_1.6.1 withr_2.4.2
## [82] survival_3.2-11 KEGGREST_1.32.0 RCurl_1.98-1.3
## [85] ash_1.0-15 tibble_3.1.2 crayon_1.4.1
## [88] KernSmooth_2.23-20 utf8_1.2.1 rmarkdown_2.9
## [91] viridis_0.6.1 locfit_1.5-9.4 grid_4.1.0
## [94] blob_1.2.1 digest_0.6.27 xtable_1.8-4
## [97] numDeriv_2016.8-1.1 munsell_0.5.0 viridisLite_0.4.0
## [100] beeswarm_0.4.0 vipor_0.4.5 bslib_0.3.0