Load the required packages

Load packages that we use

## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## Loading required package: AnnotationFilter
## 
## Attaching package: 'ensembldb'
## The following object is masked from 'package:stats':
## 
##     filter
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:ensembldb':
## 
##     filter, select
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following object is masked from 'package:matrixStats':
## 
##     count
## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect
## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:GenomicRanges':
## 
##     reduce
## The following object is masked from 'package:IRanges':
## 
##     reduce
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:AnnotationFilter':
## 
##     not
## The following object is masked from 'package:GenomicRanges':
## 
##     subtract

Input data

The package RNAseqQC provide a dataset related to bulk RNA sequencing. So, we will use this dataset for our analysis.

count_mat <- counts(T47D)

Create metadata for assay

meta <- data.frame(colData(T47D))

After the data loading process, I would like to know: * How many samples are there?

ncol(count_mat)
## [1] 24

There are 24 samples

nrow(count_mat)
## [1] 43576

There are 43576 genes

boxplot(count_mat[,3][count_mat[,3] != 0])

Create a DESeq2 object

library(AnnotationHub)
## Loading required package: BiocFileCache
## Loading required package: dbplyr
## 
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
## 
##     ident, sql
## 
## Attaching package: 'AnnotationHub'
## The following object is masked from 'package:Biobase':
## 
##     cache
dds <- make_dds(counts = count_mat, metadata = meta, ah_record = "AH89426")
## There are 466 gene IDs that could not be annotated.

QC plots

Total count of reads among samples

plot_total_counts(dds)

Should I filter any gene or sample?

From the plot, we can see that with the read threshold increasing, the number of genes detected as expressed decrease –> different number of reads in genes between samples

plot_gene_detection(dds, threshold = c(0, 3, 5, 10, 20, 30, 50))
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## ℹ The deprecated feature was likely used in the RNAseqQC package.
##   Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Is the replicates similar to each other?

dds <- filter_genes(dds, min_count = 5, min_rep = 4) # filter data 
vsd <- vst(dds) # normalize data 
colData(vsd)$trt_mut <- paste0(colData(vsd)$treatment, "_", colData(vsd)$mutation)
ma_plots <- plot_sample_MAs(vsd, group = "trt_mut")
cowplot::plot_grid(plotlist = ma_plots[21:24], ncol = 2)

What is the fraction of reads represent the fraction of genes?

plot_library_complexity(dds)

What are the biotypes of the genes in the dataset?

There are 5 main biotypes of genes in the dataset: protein-coding genes, lncRNA gene, pseudogenes, snRNA genes, MT RNA genes

plot_biotypes(dds)

Variance stablizing

library(vsn)
vsd <- vst(dds)  
mean_sd_plot(vsd)

Chromosomal expression

library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.18.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##     genomic data. Bioinformatics 2016.
## 
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
map(c("1", "5", "14"), ~plot_chromosome(vsd, .x))
## [[1]]

## 
## [[2]]

## 
## [[3]]

Clustering

set.seed(1)
plot_sample_clustering(vsd, anno_vars = c("treatment", "mutation", "replicate"), distance = "euclidean")

PCA plot

plot_pca(vsd, PC_x = 1, PC_y = 2, color_by = "treatment", shape_by = "mutation")

Investigate the DESeq2 object

str(dds)
## Formal class 'DESeqDataSet' [package "DESeq2"] with 8 slots
##   ..@ design            :Class 'formula'  language ~1
##   ..@ dispersionFunction:function ()  
##   ..@ rowRanges         :Formal class 'CompressedGRangesList' [package "GenomicRanges"] with 5 slots
##   .. .. ..@ unlistData     :Formal class 'GRanges' [package "GenomicRanges"] with 7 slots
##   .. .. .. .. ..@ seqnames       :Formal class 'Rle' [package "S4Vectors"] with 4 slots
##   .. .. .. .. .. .. ..@ values         : Factor w/ 0 levels: 
##   .. .. .. .. .. .. ..@ lengths        : int(0) 
##   .. .. .. .. .. .. ..@ elementMetadata: NULL
##   .. .. .. .. .. .. ..@ metadata       : list()
##   .. .. .. .. ..@ ranges         :Formal class 'IRanges' [package "IRanges"] with 6 slots
##   .. .. .. .. .. .. ..@ start          : int(0) 
##   .. .. .. .. .. .. ..@ width          : int(0) 
##   .. .. .. .. .. .. ..@ NAMES          : NULL
##   .. .. .. .. .. .. ..@ elementType    : chr "ANY"
##   .. .. .. .. .. .. ..@ elementMetadata: NULL
##   .. .. .. .. .. .. ..@ metadata       : list()
##   .. .. .. .. ..@ strand         :Formal class 'Rle' [package "S4Vectors"] with 4 slots
##   .. .. .. .. .. .. ..@ values         : Factor w/ 3 levels "+","-","*": 
##   .. .. .. .. .. .. ..@ lengths        : int(0) 
##   .. .. .. .. .. .. ..@ elementMetadata: NULL
##   .. .. .. .. .. .. ..@ metadata       : list()
##   .. .. .. .. ..@ seqinfo        :Formal class 'Seqinfo' [package "GenomeInfoDb"] with 4 slots
##   .. .. .. .. .. .. ..@ seqnames   : chr(0) 
##   .. .. .. .. .. .. ..@ seqlengths : int(0) 
##   .. .. .. .. .. .. ..@ is_circular: logi(0) 
##   .. .. .. .. .. .. ..@ genome     : chr(0) 
##   .. .. .. .. ..@ elementMetadata:Formal class 'DFrame' [package "S4Vectors"] with 6 slots
##   .. .. .. .. .. .. ..@ rownames       : NULL
##   .. .. .. .. .. .. ..@ nrows          : int 0
##   .. .. .. .. .. .. ..@ elementType    : chr "ANY"
##   .. .. .. .. .. .. ..@ elementMetadata: NULL
##   .. .. .. .. .. .. ..@ metadata       : list()
##   .. .. .. .. .. .. ..@ listData       : Named list()
##   .. .. .. .. ..@ elementType    : chr "ANY"
##   .. .. .. .. ..@ metadata       : list()
##   .. .. ..@ elementMetadata:Formal class 'DFrame' [package "S4Vectors"] with 6 slots
##   .. .. .. .. ..@ rownames       : NULL
##   .. .. .. .. ..@ nrows          : int 36562
##   .. .. .. .. ..@ elementType    : chr "ANY"
##   .. .. .. .. ..@ elementMetadata:Formal class 'DFrame' [package "S4Vectors"] with 6 slots
##   .. .. .. .. .. .. ..@ rownames       : NULL
##   .. .. .. .. .. .. ..@ nrows          : int 5
##   .. .. .. .. .. .. ..@ elementType    : chr "ANY"
##   .. .. .. .. .. .. ..@ elementMetadata: NULL
##   .. .. .. .. .. .. ..@ metadata       : list()
##   .. .. .. .. .. .. ..@ listData       :List of 2
##   .. .. .. .. .. .. .. ..$ type       : chr [1:5] "input" "input" "input" "input" ...
##   .. .. .. .. .. .. .. ..$ description: chr [1:5] "" "" "" "" ...
##   .. .. .. .. ..@ metadata       : list()
##   .. .. .. .. ..@ listData       :List of 5
##   .. .. .. .. .. ..$ gene_name   : chr [1:36562] "DDX11L1" "MIR6859-1" "WASH7P" "MIR1302-2" ...
##   .. .. .. .. .. ..$ gene_biotype: chr [1:36562] "transcribed_unprocessed_pseudogene" "miRNA" "unprocessed_pseudogene" "miRNA" ...
##   .. .. .. .. .. ..$ chromosome  : chr [1:36562] "1" "1" "1" "1" ...
##   .. .. .. .. .. ..$ chr_start   : int [1:36562] 11869 17369 14404 30366 29554 89551 89295 131025 135141 137682 ...
##   .. .. .. .. .. ..$ gc_content  : num [1:36562] 56.7 61.8 61.1 31.2 52.6 ...
##   .. .. ..@ elementType    : chr "GRanges"
##   .. .. ..@ metadata       : list()
##   .. .. ..@ partitioning   :Formal class 'PartitioningByEnd' [package "IRanges"] with 5 slots
##   .. .. .. .. ..@ end            : int [1:36562] 0 0 0 0 0 0 0 0 0 0 ...
##   .. .. .. .. ..@ NAMES          : chr [1:36562] "ENSG00000223972" "ENSG00000278267" "ENSG00000227232" "ENSG00000284332" ...
##   .. .. .. .. ..@ elementType    : chr "ANY"
##   .. .. .. .. ..@ elementMetadata: NULL
##   .. .. .. .. ..@ metadata       : list()
##   ..@ colData           :Formal class 'DFrame' [package "S4Vectors"] with 6 slots
##   .. .. ..@ rownames       : chr [1:24] "veh_WT_rep1" "veh_WT_rep2" "veh_WT_rep3" "veh_WT_rep4" ...
##   .. .. ..@ nrows          : int 24
##   .. .. ..@ elementType    : chr "ANY"
##   .. .. ..@ elementMetadata:Formal class 'DFrame' [package "S4Vectors"] with 6 slots
##   .. .. .. .. ..@ rownames       : NULL
##   .. .. .. .. ..@ nrows          : int 3
##   .. .. .. .. ..@ elementType    : chr "ANY"
##   .. .. .. .. ..@ elementMetadata: NULL
##   .. .. .. .. ..@ metadata       : list()
##   .. .. .. .. ..@ listData       :List of 2
##   .. .. .. .. .. ..$ type       : chr [1:3] "input" "input" "input"
##   .. .. .. .. .. ..$ description: chr [1:3] "" "" ""
##   .. .. ..@ metadata       : list()
##   .. .. ..@ listData       :List of 3
##   .. .. .. ..$ treatment: chr [1:24] "veh" "veh" "veh" "veh" ...
##   .. .. .. ..$ mutation : chr [1:24] "WT" "WT" "WT" "WT" ...
##   .. .. .. ..$ replicate: chr [1:24] "rep1" "rep2" "rep3" "rep4" ...
##   ..@ assays            :Formal class 'SimpleAssays' [package "SummarizedExperiment"] with 1 slot
##   .. .. ..@ data:Formal class 'SimpleList' [package "S4Vectors"] with 4 slots
##   .. .. .. .. ..@ listData       :List of 1
##   .. .. .. .. .. ..$ counts: int [1:36562, 1:24] 772 290 11914 0 89 1040 1835 142 75 183 ...
##   .. .. .. .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. .. .. .. ..$ : chr [1:36562] "ENSG00000223972" "ENSG00000278267" "ENSG00000227232" "ENSG00000284332" ...
##   .. .. .. .. .. .. .. ..$ : chr [1:24] "veh_WT_rep1" "veh_WT_rep2" "veh_WT_rep3" "veh_WT_rep4" ...
##   .. .. .. .. ..@ elementType    : chr "ANY"
##   .. .. .. .. ..@ elementMetadata: NULL
##   .. .. .. .. ..@ metadata       : list()
##   ..@ NAMES             : NULL
##   ..@ elementMetadata   :Formal class 'DFrame' [package "S4Vectors"] with 6 slots
##   .. .. ..@ rownames       : NULL
##   .. .. ..@ nrows          : int 36562
##   .. .. ..@ elementType    : chr "ANY"
##   .. .. ..@ elementMetadata: NULL
##   .. .. ..@ metadata       : list()
##   .. .. ..@ listData       : Named list()
##   ..@ metadata          :List of 1
##   .. ..$ version:Classes 'package_version', 'numeric_version'  hidden list of 1
##   .. .. ..$ : int [1:3] 1 42 0

Differential testing

dds$treatment <- as.factor(dds$treatment)
dds$mutation <- as.factor(dds$mutation)
design(dds) <- ~ mutation + treatment
dds <- DESeq(dds, parallel = T)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 2 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 2 workers
plotDispEsts(dds)

Shrinkage estimation of log2 fold changes (lfc) between veh and E2 conditions

de_res <- lfcShrink(dds, coef = "treatment_veh_vs_E2", lfcThreshold = log2(2), type = "normal", parallel = TRUE)
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
# MA plot
plot_ma(de_res, dds, annotate_top_n = 10)

de_res <- as.data.frame(de_res)
de_genes <- de_res[abs(de_res[,2]) >= 1,]
sig_genes <- de_genes[de_genes[,5] <= 0.05,]
upreg_res <- sig_genes[sig_genes[,2] >= 0,]
downreg_res <- sig_genes[sig_genes[,2] < 0,]

Gene ontology analysis

all_genes <- as.character(rownames(de_res))
upreg_genes <- as.character(rownames(upreg_res))
downreg_genes <- as.character(rownames(downreg_res))
library(clusterProfiler)
## 
## clusterProfiler v4.10.0  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
## 
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
## 
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:purrr':
## 
##     simplify
## The following objects are masked from 'package:ensembldb':
## 
##     filter, select
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:stats':
## 
##     filter
library(org.Hs.eg.db)
## 
upreg_ego <- enrichGO(gene = upreg_genes,
                universe = all_genes,
                keyType = "ENSEMBL",
                OrgDb = org.Hs.eg.db,
                ont = "BP",
                pAdjustMethod = "none",
                pvalueCutoff = 0.05,
                readable = TRUE)

cluster_summary <- data.frame(upreg_ego)
dotplot(upreg_ego, showCategory=7)

cnetplot(upreg_ego,
        categorySize="pvalue",
        showCategory = 5,
        foldChange= list(foldChange = upreg_res$log2FoldChange),
        vertex.label.font=3)
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(foldChange = your_value)' instead of 'foldChange'.
##  The foldChange parameter will be removed in the next version.
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## Warning in FUN(X[[i]], ...): NAs introduced by coercion

downreg_ego <- enrichGO(gene = downreg_genes,
                universe = all_genes,
                keyType = "ENSEMBL",
                OrgDb = org.Hs.eg.db,
                ont = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff = 0.05,
                readable = TRUE)
cluster_summary_downreg <- data.frame(downreg_ego)
dotplot(downreg_ego, showCategory = 7)

cnetplot(downreg_ego,
        categorySize="pvalue",
        showCategory = 5,
        foldChange= list(foldChange = downreg_res$log2FoldChange),
        vertex.label.font=3)
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(foldChange = your_value)' instead of 'foldChange'.
##  The foldChange parameter will be removed in the next version.
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## Warning in FUN(X[[i]], ...): NAs introduced by coercion