library(here) # for reproducible paths
library(SingleCellExperiment)
library(scater) # For qc and visualisation
library(scran) # For normalisation
library(Matrix) # For log transorming the raw data
library(ggplot2) # To add titles to plots
library(batchelor) # Batch correction
# Adapted function from VISION to log tranform sparse matrix
# I could not download the package
matLog2 <- function(spmat, scale = FALSE, scaleFactor = 1e6) {
if (scale == TRUE) {
spmat <- t( t(spmat) / colSums(spmat)) * scaleFactor
}
if (is(spmat, "sparseMatrix")) {
matsum <- summary(spmat)
logx <- log2(matsum$x + 1)
logmat <- sparseMatrix(i = matsum$i, j = matsum$j,
x = logx, dims = dim(spmat),
dimnames = dimnames(spmat))
} else {
logmat <- log2(spmat + 1)
}
return(logmat)
}
project <- "fire-mice"
Combat was shown to outperform other batch correction methods for simple batch correction (Buttner et. al, 2019). However, this will also regress other biological differences that are not well balanced between batches. Integration techniques account for this fact, with the downside that it can lead to over-correction due to increased degrees of freedom of these non-linear methods.
We use a merge tree, useful for merging together batches that are known to be more closely related (same tissue) before attempting difficult merges involving more dissimilar batches.
if (!(file.exists(
here("processed", project, "sce_corrected.RDS")
))) {
sce <- readRDS(here("processed", project, "sce_dimred_01.RDS"))
set.seed(100)
sce <- correctExperiments(sce,
batch = factor(sce$chip),
subset.row = rowSubset(sce),
correct.all=TRUE,
PARAM = FastMnnParam(
merge.order =
list(list("3","5"), list("4","6")),
d = 25,
prop.k=0.05
)
)
} else {
sce <- readRDS(here("processed", project, "sce_corrected.RDS"))
}
For seek of time this is skipped -This plot is not meant to be used with the new gene values, as these are just a reconstruction that should only be used for visual purpose. I was still curious to see how it would look like.-
# Only compute if first time
if (!(file.exists(here("processed", project, "variance_explained_corrected.RDS")))) {
# Log transfrom the corrected values
assay(sce, "log_reconstructed") <- matLog2(assay(sce, "reconstructed"))
# Calculate the matrix (time consuming step)
var_corrected <- getVarianceExplained(
sce,
exprs_values = "log_reconstructed",
variables = c(
"tissue",
"chip",
"age",
"genotype",
"mouse",
"total"
)
)
saveRDS(var_corrected, here("processed", project, "variance_explained_corrected.RDS"))
#If not just load created object
} else{
var_corrected <- readRDS(here("processed", project, "variance_explained_corrected.RDS"))
}
plotExplanatoryVariables(var_corrected)
plotReducedDim(sce, colour_by= "chip", point_size=0.1, dimred = "corrected") +
ggtitle("After batch correction, small dots")
if (!(file.exists(
here("processed", project, "sce_corrected.RDS")
))) {
#keep the previous dimensional reduction just in case
reducedDim(sce, "TSNE_uncorrected") <- reducedDim(sce, "TSNE")
set.seed(100)
sce <- runTSNE(sce, dimred="corrected")
}
# plot uit
plotReducedDim(sce, colour_by= "chip", point_size=0.1, dimred = "TSNE_uncorrected") +
ggtitle("TSNE dimensional reduction before correction")
plotReducedDim(sce, colour_by= "chip", point_size=0.1, dimred = "TSNE") +
ggtitle("TSNE dimensional reduction corrected")
One useful diagnostic is the proportion of variance within each batch that is lost during MNN correction. Specifically, this refers to the within-batch variance that is removed during orthogonalization with respect to the average correction vector at each merge step. This is in the metadata, which contains a matrix of the variance lost in each batch (column) at each merge step (row).
Large proportions of lost variance (>10%) suggest that correction is removing genuine biological heterogeneity. This would occur due to violations of the assumption of orthogonality between the batch effect and the biological subspace. In this case, the proportion of lost variance is small, indicating that non-orthogonality is not a major concern.
metadata(sce)$merge.info$lost.var
## 3 4 5 6
## [1,] 0.00000000 0.02466918 0.00000000 0.02541417
## [2,] 0.01242760 0.00000000 0.01344079 0.00000000
## [3,] 0.02402459 0.02968414 0.02588013 0.03242801
if (!(file.exists(
here("processed", project, "sce_corrected.RDS")
))) { saveRDS(sce, here("processed", project, "sce_corrected.RDS"))}
Click to expand
sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19041)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252
## [2] LC_CTYPE=English_United Kingdom.1252
## [3] LC_MONETARY=English_United Kingdom.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United Kingdom.1252
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] batchelor_1.8.0 Matrix_1.3-4
## [3] scran_1.20.1 scater_1.20.1
## [5] ggplot2_3.3.5 scuttle_1.2.1
## [7] SingleCellExperiment_1.14.1 SummarizedExperiment_1.22.0
## [9] Biobase_2.52.0 GenomicRanges_1.44.0
## [11] GenomeInfoDb_1.28.1 IRanges_2.26.0
## [13] S4Vectors_0.30.0 BiocGenerics_0.38.0
## [15] MatrixGenerics_1.4.2 matrixStats_0.60.0
## [17] here_1.0.1
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-7 rprojroot_2.0.2
## [3] tools_4.1.1 bslib_0.2.5.1
## [5] utf8_1.2.2 R6_2.5.0
## [7] irlba_2.3.3 ResidualMatrix_1.2.0
## [9] vipor_0.4.5 DBI_1.1.1
## [11] colorspace_2.0-2 withr_2.4.2
## [13] tidyselect_1.1.1 gridExtra_2.3
## [15] compiler_4.1.1 BiocNeighbors_1.10.0
## [17] DelayedArray_0.18.0 labeling_0.4.2
## [19] sass_0.4.0 scales_1.1.1
## [21] stringr_1.4.0 digest_0.6.27
## [23] rmarkdown_2.10 XVector_0.32.0
## [25] pkgconfig_2.0.3 htmltools_0.5.1.1
## [27] sparseMatrixStats_1.4.2 highr_0.9
## [29] limma_3.48.2 rlang_0.4.11
## [31] DelayedMatrixStats_1.14.2 farver_2.1.0
## [33] jquerylib_0.1.4 generics_0.1.0
## [35] jsonlite_1.7.2 BiocParallel_1.26.1
## [37] dplyr_1.0.7 RCurl_1.98-1.3
## [39] magrittr_2.0.1 BiocSingular_1.8.1
## [41] GenomeInfoDbData_1.2.6 Rcpp_1.0.7
## [43] ggbeeswarm_0.6.0 munsell_0.5.0
## [45] fansi_0.5.0 viridis_0.6.1
## [47] lifecycle_1.0.0 stringi_1.7.3
## [49] yaml_2.2.1 edgeR_3.34.0
## [51] zlibbioc_1.38.0 Rtsne_0.15
## [53] grid_4.1.1 dqrng_0.3.0
## [55] crayon_1.4.1 lattice_0.20-44
## [57] cowplot_1.1.1 beachmat_2.8.0
## [59] locfit_1.5-9.4 metapod_1.0.0
## [61] knitr_1.33 pillar_1.6.2
## [63] igraph_1.2.6 ScaledMatrix_1.0.0
## [65] glue_1.4.2 evaluate_0.14
## [67] vctrs_0.3.8 gtable_0.3.0
## [69] purrr_0.3.4 assertthat_0.2.1
## [71] xfun_0.25 rsvd_1.0.5
## [73] viridisLite_0.4.0 tibble_3.1.3
## [75] beeswarm_0.4.0 cluster_2.1.2
## [77] bluster_1.2.1 statmod_1.4.36
## [79] ellipsis_0.3.2