Barkley/O’Connor RNA-Seq analysis

Barry Digby, Pilib O Broin

library(knitr)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(biomaRt)
library(tximport)
library(rhdf5)
library(gplots)

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

library(DESeq2)

## Loading required package: S4Vectors

## Loading required package: stats4

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:gplots':
## 
##     space

## The following objects are masked from 'package:dplyr':
## 
##     first, rename

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice

## Loading required package: GenomicRanges

## Loading required package: GenomeInfoDb

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'matrixStats'

## The following object is masked from 'package:dplyr':
## 
##     count

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

library(apeglm)
library(RColorBrewer)
library(IHW)
library(pheatmap)
library(PCAtools)

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:IHW':
## 
##     alpha

## Loading required package: ggrepel

## 
## Attaching package: 'PCAtools'

## The following objects are masked from 'package:stats':
## 
##     biplot, screeplot

library(EnhancedVolcano)

## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2

library(pvclust)
library(ComplexHeatmap)

## Loading required package: grid

## ========================================
## ComplexHeatmap version 2.8.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
## ! pheatmap() has been masked by ComplexHeatmap::pheatmap(). Most of the arguments
##    in the original pheatmap() are identically supported in the new function. You 
##    can still use the original function by explicitly calling pheatmap::pheatmap().

## 
## Attaching package: 'ComplexHeatmap'

## The following object is masked from 'package:pheatmap':
## 
##     pheatmap

library(circlize)

## ========================================
## circlize version 0.4.12
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
##   in R. Bioinformatics 2014.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(circlize))
## ========================================

library(fgsea)
library(DT)
library(clusterProfiler)

##

## clusterProfiler v4.0.0  For help: https://guangchuangyu.github.io/software/clusterProfiler
## 
## If you use clusterProfiler in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.

## 
## Attaching package: 'clusterProfiler'

## The following object is masked from 'package:IRanges':
## 
##     slice

## The following object is masked from 'package:S4Vectors':
## 
##     rename

## The following object is masked from 'package:biomaRt':
## 
##     select

## The following object is masked from 'package:stats':
## 
##     filter

library(ggplot2)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ tibble  3.1.2     ✓ purrr   0.3.4
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x ggplot2::alpha()          masks IHW::alpha()
## x IRanges::collapse()       masks dplyr::collapse()
## x Biobase::combine()        masks BiocGenerics::combine(), dplyr::combine()
## x matrixStats::count()      masks dplyr::count()
## x IRanges::desc()           masks dplyr::desc()
## x tidyr::expand()           masks S4Vectors::expand()
## x clusterProfiler::filter() masks dplyr::filter(), stats::filter()
## x S4Vectors::first()        masks dplyr::first()
## x dplyr::lag()              masks stats::lag()
## x ggplot2::Position()       masks BiocGenerics::Position(), base::Position()
## x purrr::reduce()           masks GenomicRanges::reduce(), IRanges::reduce()
## x clusterProfiler::rename() masks S4Vectors::rename(), dplyr::rename()
## x clusterProfiler::select() masks biomaRt::select(), dplyr::select()
## x purrr::simplify()         masks clusterProfiler::simplify()
## x clusterProfiler::slice()  masks IRanges::slice(), dplyr::slice()

library(ggpubr)

Sample Metadata

samples <- read.csv("/data/projects/D_O_Connor/reformat_samples.csv", header=T, row.names = "samples")
samples$ER <- gsub("_negative", "-", samples$ER)
samples$ER <- gsub("_positive", "+", samples$ER)
samples$LVI <- gsub("_negative", "-", samples$LVI)
samples$LVI <- gsub("_positive", "+", samples$LVI)
samples$PR <- gsub("_negative", "-", samples$PR)
samples$PR <- gsub("_positive", "+", samples$PR)
samples$Condition <- gsub("Tumour", "TASC", samples$Condition)
samples$Condition <- gsub("Normal", "TAN", samples$Condition)
DT::datatable(samples)

dir <- ("/data/projects/D_O_Connor/quantification")
files <- file.path(dir, rownames(samples), "abundance.h5")
names(files) <- paste0(rownames(samples))
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
tx2gene <- getBM(attributes = c("ensembl_transcript_id_version", "hgnc_symbol"), mart = mart)
txi <- tximport(files, type = "kallisto", tx2gene = tx2gene)
samples$Patient <- as.factor(samples$Patient)
samples$Condition <- as.factor(samples$Condition)
dds <- DESeqDataSetFromTximport(txi, colData = samples, design = ~ Patient + Condition)
dds$Condition <- relevel(dds$Condition, ref = "TAN")
dds <- DESeq(dds)

Exploratory Data analysis

Exploratory data analysis (EDA) involves transforming the count data set to visually explore sample-sample relationships. It is important to note that this step is separate to the statistical analysis step, which uses raw count data. EDA can be used to identify poor quality samples and to identify sources of variation which must be accounted for when designing our model for DESeq2's Generalized Linear Model.

Data Transformations

Firstly, we will choose which transformation to apply to the data set: Logarithm Base 2 + Pseudocount log2, Variance Stabilizing Transformation vst or Regularized-Logarithm Transformation rlog. All three transformations were applied to the data set and inspected using meanSDPlots, which plots the relationship between row standard deviations versus row means.

log2

library(vsn)
library(hexbin)

logd <- log2(counts(dds, normalized=TRUE) + 1)
vsd <- assay(vst(dds, blind=FALSE))
rld <- assay(rlog(dds, blind=FALSE))

z <- meanSdPlot(logd, ranks=FALSE, plot=FALSE)
z$gg + ggtitle("Logarithm Base 2 + Pseudocount (+1)")

vst

x <- meanSdPlot(vsd, ranks = FALSE, plot = FALSE)
x$gg + ggtitle("Variance Stabilizing Transformation")

rlog

y <- meanSdPlot(rld, ranks = FALSE, plot = FALSE)
y$gg + ggtitle("Regularized-Logarithm Transformation")

I use the term ‘counts’ below referring to the mean (mean row gene counts) on the x-axis. The legend name ‘counts’ is slightly confusing and could be thought of as ‘density’ instead of ‘counts’

The Logarithm Base 2 + Pseudocount plot shows an increase in variance in genes with low counts. When computing a sample-sample distance matrix, these genes will contribute greatly to the distance metrics generated, which is not what we want. Low count genes are not overly informative. The reason this is happening is taking the logarithm of a small number will overinflate it’s value.

In the two plots given above, I would say that the rlog transformation is more appropriate for our EDA. The vst plot is bloated towards the bottom left of the plot, genes with low counts (row means i.e x-axis) have a high standard deviation. These genes will contribute towards the variation in our data set when computing a distance matrix, which we don’t want.

The rlog plot shows more of a funnel/cone shape, genes with a lower count have a lower standard deviation in this transformation. This is exactly what we want - genes with high (informative) counts to contribute to the variation in the dataset when computing the distance matrix!

Sample Heatmaps

sampleDists <- dist(t(vsd))
sampleDistMatrix <- as.matrix(sampleDists)
colnames(sampleDistMatrix) <- NULL
pheatmap(mat=sampleDistMatrix,
         show_rownames = TRUE,
         cluster_cols = TRUE,
         cluster_rows = TRUE,
         annotation_col = samples,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colorRampPalette( rev(brewer.pal(9, "Blues")) )(255))

PCA

p <- pca(vsd, metadata = samples, center = TRUE, removeVar = 0.1)

biplot(p,
       
       # Loadings
       showLoadings = FALSE,
       ntopLoadings = 5,
       showLoadingsNames = TRUE,
       colLoadingsNames = 'black',
       colLoadingsArrows = 'black',
       alphaLoadingsArrow = 0.5,
       sizeLoadingsNames = 4,
       boxedLoadingsNames = FALSE,
       drawConnectorsLoadings = TRUE,
       widthConnectorsLoadings = 1,
       colConnectorsLoadings = 'green',
       
       # color points
       colby = 'Condition',
       colkey = c("TAN"="royalblue", "TASC"="red"),
       colLegendTitle = "TSC vs. TSC-SC",
       shape = NULL,
       shapekey = c("ER-"=1, "ER+"=3),
       shapeLegendTitle = "shapes",
       pointSize = 5,
       
       # legends
       legendPosition = "right",
       legendLabSize = 10,
       legendTitleSize = 10,
       legendIconSize = 4,
       
       # Encircle (does not try to force fit)
       encircle = FALSE,
       encircleFill = FALSE,
       encircleFillKey = NULL,
       encircleAlpha = 0.8,
       encircleLineSize = 1,
       encircleLineCol = 'black',
       
       # ellipse
       ellipse = FALSE,
       ellipseType = "t",
       ellipseFill = FALSE,
       ellipseFillKey = NULL,
       ellipseAlpha = 0.8,
       ellipseLineSize = 1,
       ellipseLineCol = 'black',
       
       # plot dims
       #xlim = c(-50,50),
       #ylim = c(-30,30),
       hline = c(10,20,30),
       hlineType = c("dashed", "longdash", "twodash"),
       hlineWidth = 1,
       hlineCol = 'black',
       vline = 10,
       vlineType = c("blank"),
       vlineCol = 'black', 
       gridlines.major = TRUE,
       gridlines.minor = TRUE,
       borderWidth = 1,
       borderColour = 'black',
       
       # labs
       xlab = "foo",
       ylab = "bar", 
       title = "lala",
       subtitle = "llelele",
       caption = "diggity",
       titleLabSize = 20,
       subtitleLabSize = 10,
       captionLabSize = 5,
       lab = rownames(samples),
       labSize = 2,
       boxedLabels = FALSE,
       selectLab = NULL,
       drawConnectors = TRUE)