Load packages that we use
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## Loading required package: AnnotationFilter
##
## Attaching package: 'ensembldb'
## The following object is masked from 'package:stats':
##
## filter
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:ensembldb':
##
## filter, select
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following object is masked from 'package:Biobase':
##
## combine
## The following object is masked from 'package:matrixStats':
##
## count
## The following objects are masked from 'package:GenomicRanges':
##
## intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
##
## intersect
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'purrr'
## The following object is masked from 'package:GenomicRanges':
##
## reduce
## The following object is masked from 'package:IRanges':
##
## reduce
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
##
## expand
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:AnnotationFilter':
##
## not
## The following object is masked from 'package:GenomicRanges':
##
## subtract
The package RNAseqQC provide a dataset related to bulk RNA sequencing. So, we will use this dataset for our analysis.
count_mat <- counts(T47D)
Create metadata for assay
meta <- data.frame(colData(T47D))
After the data loading process, I would like to know: * How many samples are there?
ncol(count_mat)
## [1] 24
There are 24 samples
nrow(count_mat)
## [1] 43576
There are 43576 genes
boxplot(count_mat[,3][count_mat[,3] != 0])
library(AnnotationHub)
## Loading required package: BiocFileCache
## Loading required package: dbplyr
##
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
##
## ident, sql
##
## Attaching package: 'AnnotationHub'
## The following object is masked from 'package:Biobase':
##
## cache
dds <- make_dds(counts = count_mat, metadata = meta, ah_record = "AH89426")
## There are 466 gene IDs that could not be annotated.
plot_total_counts(dds)
From the plot, we can see that with the read threshold increasing, the number of genes detected as expressed decrease –> different number of reads in genes between samples
plot_gene_detection(dds, threshold = c(0, 3, 5, 10, 20, 30, 50))
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## ℹ The deprecated feature was likely used in the RNAseqQC package.
## Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
dds <- filter_genes(dds, min_count = 5, min_rep = 4) # filter data
vsd <- vst(dds) # normalize data
colData(vsd)$trt_mut <- paste0(colData(vsd)$treatment, "_", colData(vsd)$mutation)
ma_plots <- plot_sample_MAs(vsd, group = "trt_mut")
cowplot::plot_grid(plotlist = ma_plots[21:24], ncol = 2)
plot_library_complexity(dds)
There are 5 main biotypes of genes in the dataset: protein-coding genes, lncRNA gene, pseudogenes, snRNA genes, MT RNA genes
plot_biotypes(dds)
library(vsn)
vsd <- vst(dds)
mean_sd_plot(vsd)
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.18.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
map(c("1", "5", "14"), ~plot_chromosome(vsd, .x))
## [[1]]
##
## [[2]]
##
## [[3]]
set.seed(1)
plot_sample_clustering(vsd, anno_vars = c("treatment", "mutation", "replicate"), distance = "euclidean")
plot_pca(vsd, PC_x = 1, PC_y = 2, color_by = "treatment", shape_by = "mutation")
str(dds)
## Formal class 'DESeqDataSet' [package "DESeq2"] with 8 slots
## ..@ design :Class 'formula' language ~1
## ..@ dispersionFunction:function ()
## ..@ rowRanges :Formal class 'CompressedGRangesList' [package "GenomicRanges"] with 5 slots
## .. .. ..@ unlistData :Formal class 'GRanges' [package "GenomicRanges"] with 7 slots
## .. .. .. .. ..@ seqnames :Formal class 'Rle' [package "S4Vectors"] with 4 slots
## .. .. .. .. .. .. ..@ values : Factor w/ 0 levels:
## .. .. .. .. .. .. ..@ lengths : int(0)
## .. .. .. .. .. .. ..@ elementMetadata: NULL
## .. .. .. .. .. .. ..@ metadata : list()
## .. .. .. .. ..@ ranges :Formal class 'IRanges' [package "IRanges"] with 6 slots
## .. .. .. .. .. .. ..@ start : int(0)
## .. .. .. .. .. .. ..@ width : int(0)
## .. .. .. .. .. .. ..@ NAMES : NULL
## .. .. .. .. .. .. ..@ elementType : chr "ANY"
## .. .. .. .. .. .. ..@ elementMetadata: NULL
## .. .. .. .. .. .. ..@ metadata : list()
## .. .. .. .. ..@ strand :Formal class 'Rle' [package "S4Vectors"] with 4 slots
## .. .. .. .. .. .. ..@ values : Factor w/ 3 levels "+","-","*":
## .. .. .. .. .. .. ..@ lengths : int(0)
## .. .. .. .. .. .. ..@ elementMetadata: NULL
## .. .. .. .. .. .. ..@ metadata : list()
## .. .. .. .. ..@ seqinfo :Formal class 'Seqinfo' [package "GenomeInfoDb"] with 4 slots
## .. .. .. .. .. .. ..@ seqnames : chr(0)
## .. .. .. .. .. .. ..@ seqlengths : int(0)
## .. .. .. .. .. .. ..@ is_circular: logi(0)
## .. .. .. .. .. .. ..@ genome : chr(0)
## .. .. .. .. ..@ elementMetadata:Formal class 'DFrame' [package "S4Vectors"] with 6 slots
## .. .. .. .. .. .. ..@ rownames : NULL
## .. .. .. .. .. .. ..@ nrows : int 0
## .. .. .. .. .. .. ..@ elementType : chr "ANY"
## .. .. .. .. .. .. ..@ elementMetadata: NULL
## .. .. .. .. .. .. ..@ metadata : list()
## .. .. .. .. .. .. ..@ listData : Named list()
## .. .. .. .. ..@ elementType : chr "ANY"
## .. .. .. .. ..@ metadata : list()
## .. .. ..@ elementMetadata:Formal class 'DFrame' [package "S4Vectors"] with 6 slots
## .. .. .. .. ..@ rownames : NULL
## .. .. .. .. ..@ nrows : int 36562
## .. .. .. .. ..@ elementType : chr "ANY"
## .. .. .. .. ..@ elementMetadata:Formal class 'DFrame' [package "S4Vectors"] with 6 slots
## .. .. .. .. .. .. ..@ rownames : NULL
## .. .. .. .. .. .. ..@ nrows : int 5
## .. .. .. .. .. .. ..@ elementType : chr "ANY"
## .. .. .. .. .. .. ..@ elementMetadata: NULL
## .. .. .. .. .. .. ..@ metadata : list()
## .. .. .. .. .. .. ..@ listData :List of 2
## .. .. .. .. .. .. .. ..$ type : chr [1:5] "input" "input" "input" "input" ...
## .. .. .. .. .. .. .. ..$ description: chr [1:5] "" "" "" "" ...
## .. .. .. .. ..@ metadata : list()
## .. .. .. .. ..@ listData :List of 5
## .. .. .. .. .. ..$ gene_name : chr [1:36562] "DDX11L1" "MIR6859-1" "WASH7P" "MIR1302-2" ...
## .. .. .. .. .. ..$ gene_biotype: chr [1:36562] "transcribed_unprocessed_pseudogene" "miRNA" "unprocessed_pseudogene" "miRNA" ...
## .. .. .. .. .. ..$ chromosome : chr [1:36562] "1" "1" "1" "1" ...
## .. .. .. .. .. ..$ chr_start : int [1:36562] 11869 17369 14404 30366 29554 89551 89295 131025 135141 137682 ...
## .. .. .. .. .. ..$ gc_content : num [1:36562] 56.7 61.8 61.1 31.2 52.6 ...
## .. .. ..@ elementType : chr "GRanges"
## .. .. ..@ metadata : list()
## .. .. ..@ partitioning :Formal class 'PartitioningByEnd' [package "IRanges"] with 5 slots
## .. .. .. .. ..@ end : int [1:36562] 0 0 0 0 0 0 0 0 0 0 ...
## .. .. .. .. ..@ NAMES : chr [1:36562] "ENSG00000223972" "ENSG00000278267" "ENSG00000227232" "ENSG00000284332" ...
## .. .. .. .. ..@ elementType : chr "ANY"
## .. .. .. .. ..@ elementMetadata: NULL
## .. .. .. .. ..@ metadata : list()
## ..@ colData :Formal class 'DFrame' [package "S4Vectors"] with 6 slots
## .. .. ..@ rownames : chr [1:24] "veh_WT_rep1" "veh_WT_rep2" "veh_WT_rep3" "veh_WT_rep4" ...
## .. .. ..@ nrows : int 24
## .. .. ..@ elementType : chr "ANY"
## .. .. ..@ elementMetadata:Formal class 'DFrame' [package "S4Vectors"] with 6 slots
## .. .. .. .. ..@ rownames : NULL
## .. .. .. .. ..@ nrows : int 3
## .. .. .. .. ..@ elementType : chr "ANY"
## .. .. .. .. ..@ elementMetadata: NULL
## .. .. .. .. ..@ metadata : list()
## .. .. .. .. ..@ listData :List of 2
## .. .. .. .. .. ..$ type : chr [1:3] "input" "input" "input"
## .. .. .. .. .. ..$ description: chr [1:3] "" "" ""
## .. .. ..@ metadata : list()
## .. .. ..@ listData :List of 3
## .. .. .. ..$ treatment: chr [1:24] "veh" "veh" "veh" "veh" ...
## .. .. .. ..$ mutation : chr [1:24] "WT" "WT" "WT" "WT" ...
## .. .. .. ..$ replicate: chr [1:24] "rep1" "rep2" "rep3" "rep4" ...
## ..@ assays :Formal class 'SimpleAssays' [package "SummarizedExperiment"] with 1 slot
## .. .. ..@ data:Formal class 'SimpleList' [package "S4Vectors"] with 4 slots
## .. .. .. .. ..@ listData :List of 1
## .. .. .. .. .. ..$ counts: int [1:36562, 1:24] 772 290 11914 0 89 1040 1835 142 75 183 ...
## .. .. .. .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. .. .. .. ..$ : chr [1:36562] "ENSG00000223972" "ENSG00000278267" "ENSG00000227232" "ENSG00000284332" ...
## .. .. .. .. .. .. .. ..$ : chr [1:24] "veh_WT_rep1" "veh_WT_rep2" "veh_WT_rep3" "veh_WT_rep4" ...
## .. .. .. .. ..@ elementType : chr "ANY"
## .. .. .. .. ..@ elementMetadata: NULL
## .. .. .. .. ..@ metadata : list()
## ..@ NAMES : NULL
## ..@ elementMetadata :Formal class 'DFrame' [package "S4Vectors"] with 6 slots
## .. .. ..@ rownames : NULL
## .. .. ..@ nrows : int 36562
## .. .. ..@ elementType : chr "ANY"
## .. .. ..@ elementMetadata: NULL
## .. .. ..@ metadata : list()
## .. .. ..@ listData : Named list()
## ..@ metadata :List of 1
## .. ..$ version:Classes 'package_version', 'numeric_version' hidden list of 1
## .. .. ..$ : int [1:3] 1 42 0
dds$treatment <- as.factor(dds$treatment)
dds$mutation <- as.factor(dds$mutation)
design(dds) <- ~ mutation + treatment
dds <- DESeq(dds, parallel = T)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 2 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 2 workers
plotDispEsts(dds)
de_res <- lfcShrink(dds, coef = "treatment_veh_vs_E2", lfcThreshold = log2(2), type = "normal", parallel = TRUE)
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
# MA plot
plot_ma(de_res, dds, annotate_top_n = 10)
de_res <- as.data.frame(de_res)
de_genes <- de_res[abs(de_res[,2]) >= 1,]
sig_genes <- de_genes[de_genes[,5] <= 0.05,]
upreg_res <- sig_genes[sig_genes[,2] >= 0,]
downreg_res <- sig_genes[sig_genes[,2] < 0,]
all_genes <- as.character(rownames(de_res))
upreg_genes <- as.character(rownames(upreg_res))
downreg_genes <- as.character(rownames(downreg_res))
library(clusterProfiler)
##
## clusterProfiler v4.10.0 For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
##
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
##
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:purrr':
##
## simplify
## The following objects are masked from 'package:ensembldb':
##
## filter, select
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following object is masked from 'package:IRanges':
##
## slice
## The following object is masked from 'package:S4Vectors':
##
## rename
## The following object is masked from 'package:stats':
##
## filter
library(org.Hs.eg.db)
##
upreg_ego <- enrichGO(gene = upreg_genes,
universe = all_genes,
keyType = "ENSEMBL",
OrgDb = org.Hs.eg.db,
ont = "BP",
pAdjustMethod = "none",
pvalueCutoff = 0.05,
readable = TRUE)
cluster_summary <- data.frame(upreg_ego)
dotplot(upreg_ego, showCategory=7)
cnetplot(upreg_ego,
categorySize="pvalue",
showCategory = 5,
foldChange= list(foldChange = upreg_res$log2FoldChange),
vertex.label.font=3)
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(foldChange = your_value)' instead of 'foldChange'.
## The foldChange parameter will be removed in the next version.
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
downreg_ego <- enrichGO(gene = downreg_genes,
universe = all_genes,
keyType = "ENSEMBL",
OrgDb = org.Hs.eg.db,
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
cluster_summary_downreg <- data.frame(downreg_ego)
dotplot(downreg_ego, showCategory = 7)
cnetplot(downreg_ego,
categorySize="pvalue",
showCategory = 5,
foldChange= list(foldChange = downreg_res$log2FoldChange),
vertex.label.font=3)
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(foldChange = your_value)' instead of 'foldChange'.
## The foldChange parameter will be removed in the next version.
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## Warning in FUN(X[[i]], ...): NAs introduced by coercion