DGEA-for-BFB

This code has been borrowed from http://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html for demonstration purposes.

Installing required packages:

source("https://bioconductor.org/biocLite.R")
biocLite("DESeq2")

if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

BiocManager::install(c("DESeq2", "pasilla", "apeglm", "EnhancedVolcano"))

installed.packages('readr')

install.packages("pheatmap")

Alternatively, you can install some packages as follows:

BiocManager::install("edgeR")

Loading the packages:

library("DESeq2")

## Loading required package: S4Vectors

## Loading required package: stats4

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## Loading required package: GenomicRanges

## Loading required package: GenomeInfoDb

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

library("pasilla")
library("readr")

Load sample and gene expression data:

This data set is from an experiment on Drosophila melanogaster cell cultures and investigated the effect of RNAi knock-down of the splicing factor pasilla (Brooks et al. 2011)

library("pasilla")
pasCts <- system.file("extdata",
                      "pasilla_gene_counts.tsv",
                      package="pasilla", mustWork=TRUE)

pasAnno <- system.file("extdata",
                       "pasilla_sample_annotation.csv",
                       package="pasilla", mustWork=TRUE)

cts <- as.matrix(read.csv(pasCts, sep="\t", row.names="gene_id"))

head(cts)

##             untreated1 untreated2 untreated3 untreated4 treated1 treated2
## FBgn0000003          0          0          0          0        0        0
## FBgn0000008         92        161         76         70      140       88
## FBgn0000014          5          1          0          0        4        0
## FBgn0000015          0          2          1          2        1        0
## FBgn0000017       4664       8714       3564       3150     6205     3072
## FBgn0000018        583        761        245        310      722      299
##             treated3
## FBgn0000003        1
## FBgn0000008       70
## FBgn0000014        0
## FBgn0000015        0
## FBgn0000017     3334
## FBgn0000018      308

coldata <- read.csv(pasAnno, row.names=1)
coldata

coldata <- coldata[,c("condition","type")]
coldata$condition <- factor(coldata$condition)
coldata$type <- factor(coldata$type)

Check the sample names and colData entries

rownames(coldata) <- sub("fb", "", rownames(coldata))
all(rownames(coldata) %in% colnames(cts))

## [1] TRUE

all(rownames(coldata) == colnames(cts))

## [1] FALSE

Reorder the samples in the count data matrix based on the order in the sample data.

cts <- cts[, rownames(coldata)]
all(rownames(coldata) == colnames(cts))

## [1] TRUE

Construct a DESeqDataSet from the txi object and sample information in samples

dds <- DESeqDataSetFromMatrix(countData = cts,
                              colData = coldata,
                              design = ~ condition)
dds

## class: DESeqDataSet 
## dim: 14599 7 
## metadata(1): version
## assays(1): counts
## rownames(14599): FBgn0000003 FBgn0000008 ... FBgn0261574 FBgn0261575
## rowData names(0):
## colnames(7): treated1 treated2 ... untreated3 untreated4
## colData names(2): condition type

(A minimal) Gene pre-filtering Note: More strict filtering to increase power is automatically applied when using DESeq2.

keep <- rowSums(counts(dds)) >= 10 #Keep genes that has more than 10 counts in total across all samples
dds <- dds[keep, ]

Define the comparison conditions

dds$condition <- factor(dds$condition, levels = c("untreated","treated"))

Run Differential Gene Expression Analysis

dds <- DESeq(dds)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

res <- results(dds)
res

## log2 fold change (MLE): condition treated vs untreated 
## Wald test p-value: condition treated vs untreated 
## DataFrame with 9921 rows and 6 columns
##               baseMean log2FoldChange     lfcSE       stat    pvalue      padj
##              <numeric>      <numeric> <numeric>  <numeric> <numeric> <numeric>
## FBgn0000008   95.14429     0.00227644  0.223729   0.010175 0.9918817  0.997211
## FBgn0000014    1.05652    -0.49512039  2.143186  -0.231021 0.8172987        NA
## FBgn0000017 4352.55357    -0.23991894  0.126337  -1.899041 0.0575591  0.288002
## FBgn0000018  418.61048    -0.10467391  0.148489  -0.704927 0.4808558  0.826834
## FBgn0000024    6.40620     0.21084779  0.689588   0.305759 0.7597879  0.943501
## ...                ...            ...       ...        ...       ...       ...
## FBgn0261570 3208.38861      0.2955329  0.127350  2.3206264  0.020307  0.144240
## FBgn0261572    6.19719     -0.9588230  0.775315 -1.2366888  0.216203  0.607848
## FBgn0261573 2240.97951      0.0127194  0.113300  0.1122634  0.910615  0.982657
## FBgn0261574 4857.68037      0.0153924  0.192567  0.0799327  0.936291  0.988179
## FBgn0261575   10.68252      0.1635705  0.930911  0.1757102  0.860522  0.967928

Order the results table based on low p-value

resOrdered <- res[order(res$pvalue),]
resOrdered

## log2 fold change (MLE): condition treated vs untreated 
## Wald test p-value: condition treated vs untreated 
## DataFrame with 9921 rows and 6 columns
##              baseMean log2FoldChange     lfcSE         stat       pvalue
##             <numeric>      <numeric> <numeric>    <numeric>    <numeric>
## FBgn0039155   730.568       -4.61874 0.1691240     -27.3098 3.24447e-164
## FBgn0025111  1501.448        2.89995 0.1273576      22.7701 9.07164e-115
## FBgn0029167  3706.024       -2.19691 0.0979154     -22.4368 1.72030e-111
## FBgn0003360  4342.832       -3.17954 0.1435677     -22.1466 1.12417e-108
## FBgn0035085   638.219       -2.56024 0.1378126     -18.5777  4.86845e-77
## ...               ...            ...       ...          ...          ...
## FBgn0051913   2.75973   -0.001450714  1.239864 -0.001170059     0.999066
## FBgn0037061 144.76833    0.000314936  0.290635  0.001083616     0.999135
## FBgn0036689 117.56024   -0.000228957  0.214822 -0.001065801     0.999150
## FBgn0046272   6.54031   -0.001980640  2.439802 -0.000811803     0.999352
## FBgn0030880  13.00577   -6.969572683  3.681447 -1.893161117           NA
##                     padj
##                <numeric>
## FBgn0039155 2.71919e-160
## FBgn0025111 3.80147e-111
## FBgn0029167 4.80595e-108
## FBgn0003360 2.35542e-105
## FBgn0035085  8.16049e-74
## ...                  ...
## FBgn0051913           NA
## FBgn0037061     0.999269
## FBgn0036689     0.999269
## FBgn0046272     0.999352
## FBgn0030880           NA

Summary of results

summary(res)

## 
## out of 9921 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 518, 5.2%
## LFC < 0 (down)     : 536, 5.4%
## outliers [1]       : 1, 0.01%
## low counts [2]     : 1539, 16%
## (mean count < 6)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

sum(res$padj < 0.1, na.rm=TRUE)

## [1] 1054

Can change the alpha threshold

res05 <- results(dds, alpha=0.05)
summary(res05)

## 
## out of 9921 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 407, 4.1%
## LFC < 0 (down)     : 431, 4.3%
## outliers [1]       : 1, 0.01%
## low counts [2]     : 1347, 14%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Exploratory plots - MA plot

plotMA(res, ylim=c(-2,2))

MA plot with shrunken log2 FC values which removes noise from low expressed genes

resultsNames(dds)

## [1] "Intercept"                      "condition_treated_vs_untreated"

resLFC <- lfcShrink(dds, coef="condition_treated_vs_untreated", type="apeglm")

## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

resLFC

## log2 fold change (MAP): condition treated vs untreated 
## Wald test p-value: condition treated vs untreated 
## DataFrame with 9921 rows and 5 columns
##               baseMean log2FoldChange     lfcSE    pvalue      padj
##              <numeric>      <numeric> <numeric> <numeric> <numeric>
## FBgn0000008   95.14429     0.00119920  0.151897 0.9918817  0.997211
## FBgn0000014    1.05652    -0.00473412  0.205468 0.8172987        NA
## FBgn0000017 4352.55357    -0.18989990  0.120377 0.0575591  0.288002
## FBgn0000018  418.61048    -0.06995753  0.123901 0.4808558  0.826834
## FBgn0000024    6.40620     0.01752715  0.198633 0.7597879  0.943501
## ...                ...            ...       ...       ...       ...
## FBgn0261570 3208.38861     0.24110290 0.1244469  0.020307  0.144240
## FBgn0261572    6.19719    -0.06576173 0.2141351  0.216203  0.607848
## FBgn0261573 2240.97951     0.01000619 0.0993764  0.910615  0.982657
## FBgn0261574 4857.68037     0.00843552 0.1408267  0.936291  0.988179
## FBgn0261575   10.68252     0.00809101 0.2014704  0.860522  0.967928

plotMA(resLFC, ylim=c(-2,2))

Ploting the counts for the gene with the smallest P-value

d <- plotCounts(dds, gene=which.min(res$padj),
                intgroup="condition",
                returnData=TRUE)

library("ggplot2")

ggplot(d, aes(x=condition, y=count, color=condition)) +
  geom_point(position=position_jitter(w=0.1,h=0), size=4) +
  scale_y_log10(breaks=c(25,100,400))

Volcano plot

library(EnhancedVolcano)

## Loading required package: ggrepel

## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2

EnhancedVolcano(res,
                labSize = 2,
                lab = rownames(res),
                x = 'log2FoldChange',
                y = 'pvalue')

## Warning: Ignoring unknown parameters: xlim, ylim

Clustering

Normalize the expression data for library size using cpm function from edgeR package

library(dendextend)

## 
## ---------------------
## Welcome to dendextend version 1.15.2
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attaching package: 'dendextend'

## The following object is masked from 'package:stats':
## 
##     cutree

cpm_pas <- edgeR::cpm(cts)

K-means clustering with pasilla data

pas_k2 <- kmeans(t(cpm_pas), centers = 2, iter.max = 100)
pas_k2$cluster

##   treated1   treated2   treated3 untreated1 untreated2 untreated3 untreated4 
##          1          1          1          2          2          2          2

Hierarchical Clustering

Calculate the distances between each pair of samples using euclidean method

d_pas <- dist(t(cpm_pas), method = 'euclidean')

Create a hierarchical clustering based on the distances calculated above using hclust function from base R

hc_pas <- hclust(d_pas, method = "complete")

Create a dendogram

dend <- as.dendrogram(hc_pas)
dend <- color_branches(dend) 

dend <- hang.dendrogram(dend)
plot(dend, 
     main = "Clustered Pasilla data set",  nodePar = list(cex = .7))

Select the genes that significantly differentially expressed between treated and untreated samples

#Heatmap for DE genes
res <- results(dds)
DEGs <- res[which(res$padj < 0.05 & abs(res$log2FoldChange) > 1.0 ),]

Plot a heatmap showing the expressions of DEGs using pheatmap function

pheatmap::pheatmap(as.matrix(cpm_pas[rownames(DEGs),]), scale = "row")

sessionInfo()

## R version 4.1.0 (2021-05-18)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] dendextend_1.15.2           EnhancedVolcano_1.10.0     
##  [3] ggrepel_0.9.1               ggplot2_3.3.5              
##  [5] readr_1.4.0                 pasilla_1.20.0             
##  [7] DESeq2_1.32.0               SummarizedExperiment_1.22.0
##  [9] Biobase_2.52.0              MatrixGenerics_1.4.0       
## [11] matrixStats_0.60.0          GenomicRanges_1.44.0       
## [13] GenomeInfoDb_1.28.1         IRanges_2.26.0             
## [15] S4Vectors_0.30.0            BiocGenerics_0.38.0        
## 
## loaded via a namespace (and not attached):
##   [1] ggbeeswarm_0.6.0       colorspace_2.0-2       ellipsis_0.3.2        
##   [4] XVector_0.32.0         farver_2.1.0           bit64_4.0.5           
##   [7] AnnotationDbi_1.54.1   fansi_0.5.0            mvtnorm_1.1-2         
##  [10] apeglm_1.14.0          splines_4.1.0          extrafont_0.17        
##  [13] cachem_1.0.6           geneplotter_1.70.0     knitr_1.33            
##  [16] jsonlite_1.7.2         Rttf2pt1_1.3.9         annotate_1.70.0       
##  [19] png_0.1-7              pheatmap_1.0.12        compiler_4.1.0        
##  [22] httr_1.4.2             assertthat_0.2.1       Matrix_1.3-3          
##  [25] fastmap_1.1.0          limma_3.48.3           htmltools_0.5.2       
##  [28] tools_4.1.0            coda_0.19-4            gtable_0.3.0          
##  [31] glue_1.4.2             GenomeInfoDbData_1.2.6 dplyr_1.0.7           
##  [34] maps_3.4.0             Rcpp_1.0.7             bbmle_1.0.24          
##  [37] jquerylib_0.1.4        vctrs_0.3.8            Biostrings_2.60.1     
##  [40] ggalt_0.4.0            extrafontdb_1.0        xfun_0.24             
##  [43] stringr_1.4.0          lifecycle_1.0.0        XML_3.99-0.6          
##  [46] edgeR_3.34.1           zlibbioc_1.38.0        MASS_7.3-54           
##  [49] scales_1.1.1           hms_1.1.0              proj4_1.0-10.1        
##  [52] RColorBrewer_1.1-2     yaml_2.2.1             gridExtra_2.3         
##  [55] memoise_2.0.0          ggrastr_1.0.1          emdbook_1.3.12        
##  [58] sass_0.4.0             bdsmatrix_1.3-4        stringi_1.6.2         
##  [61] RSQLite_2.2.8          highr_0.9              genefilter_1.74.1     
##  [64] BiocParallel_1.26.1    rlang_0.4.11           pkgconfig_2.0.3       
##  [67] bitops_1.0-7           evaluate_0.14          lattice_0.20-44       
##  [70] purrr_0.3.4            labeling_0.4.2         bit_4.0.4             
##  [73] tidyselect_1.1.1       plyr_1.8.6             magrittr_2.0.1        
##  [76] R6_2.5.0               generics_0.1.0         DelayedArray_0.18.0   
##  [79] DBI_1.1.1              pillar_1.6.1           withr_2.4.2           
##  [82] survival_3.2-11        KEGGREST_1.32.0        RCurl_1.98-1.3        
##  [85] ash_1.0-15             tibble_3.1.2           crayon_1.4.1          
##  [88] KernSmooth_2.23-20     utf8_1.2.1             rmarkdown_2.9         
##  [91] viridis_0.6.1          locfit_1.5-9.4         grid_4.1.0            
##  [94] blob_1.2.1             digest_0.6.27          xtable_1.8-4          
##  [97] numDeriv_2016.8-1.1    munsell_0.5.0          viridisLite_0.4.0     
## [100] beeswarm_0.4.0         vipor_0.4.5            bslib_0.3.0