remake their example from the link.

library(magrittr)
cts <- read.table("~/bottomly_count_table.txt", header=TRUE, row.names=1)
coldata <- read.table("~/bottomly_phenodata.txt", header=TRUE, row.names=1)
all(colnames(cts) == rownames(coldata))
## [1] TRUE
coldata$strain %<>% (function(x) sub("/",".",x))
coldata$strain %<>% factor
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'GenomicRanges'
## The following object is masked from 'package:magrittr':
## 
##     subtract
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
dds <- DESeqDataSetFromMatrix(cts, coldata, ~strain)
dds$batch <- factor(dds$experiment.number)
table(dds$strain, dds$batch)
##           
##            4 6 7
##   C57BL.6J 3 4 3
##   DBA.2J   4 3 4
dds <- estimateSizeFactors(dds)
norm.cts <- counts(dds, normalized=TRUE)
library(sva)
## Loading required package: mgcv
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:IRanges':
## 
##     collapse
## This is mgcv 1.8-40. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
## 
## Attaching package: 'genefilter'
## The following objects are masked from 'package:MatrixGenerics':
## 
##     rowSds, rowVars
## The following objects are masked from 'package:matrixStats':
## 
##     rowSds, rowVars
## Loading required package: BiocParallel
mm <- model.matrix(~ strain, colData(dds))
mm0 <- model.matrix(~ 1, colData(dds))
norm.cts <- norm.cts[rowSums(norm.cts) > 0,]
fit <- svaseq(norm.cts, mod=mm, mod0=mm0, n.sv=2)
## Number of significant surrogate variables is:  2 
## Iteration (out of 5 ):1  2  3  4  5
library(rafalib)
bigpar()
dds$strain.int <- as.integer(dds$strain) + 15
plot(fit$sv[,1:2], col=dds$batch, pch=dds$strain.int, cex=2,
     xlab="SV1", ylab="SV2")
legend("top", levels(dds$batch), pch=16,
       col=1:3, cex=.8, ncol=3, title="batch")

sanity checks

lets take a look at the eigen cor plot first.

pca = PCAtools::pca(varianceStabilizingTransformation(counts(dds)), metadata = coldata)
PCAtools::eigencorplot(pca, components = PCAtools::getComponents(pca, 1:10),
    metavars = c('strain','experiment.number'),
    col = c('white', 'cornsilk1', 'gold', 'forestgreen', 'darkgreen'),
    cexCorval = 1.2, fontCorval = 2, posLab = 'all', rotLabX = 45,
    scale = TRUE,
    main = bquote(PC ~ Pearson ~ r^2 ~ clinical ~ correlates),
    plotRsquared = TRUE,
    corFUN = 'pearson',
    corUSE = 'pairwise.complete.obs',
    corMultipleTestCorrection = 'BH',
    signifSymbols = c('****', '***', '**', '*', ''),
    signifCutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1))
## Warning in PCAtools::eigencorplot(pca, components =
## PCAtools::getComponents(pca, : strain is not numeric - please check the source
## data as non-numeric variables will be coerced to numeric

pca$variance[1]
##      PC1 
## 27.51171

you can see that experiment.number is correlated with PC1 which explains 27% variance in the dataset. not good…


we can correlate the surrogate variables against the principal components

cor(pca$rotated$PC1, fit$sv[,1]) #pc1 vs sv1
## [1] 0.9607585
cor(pca$rotated$PC2, fit$sv[,1]) #pc2 vs sv1
## [1] 0.2690725
cor(pca$rotated$PC1, fit$sv[,2]) #pc1 vs sv2
## [1] 0.0394498
cor(pca$rotated$PC2, fit$sv[,2]) #pc2 vs sv2
## [1] -0.2592778

strong correlation between sv1 and pc1, which we saw is the ‘batch effect’ for experiment.number. Just another way of confirming that in their toy example, sv 1 did a good job of capturing the batch effect.


In your case, you dont have this batch effect in your metadata file im guessing.

what you need to do when you conduct svaseq is double check by using cor(PC$rotated$PCn vs fit$SVn) that the surrogate variables are not correlated with any biological variables!

if they are not, then include them in your DESeq2 model - happy days.