Scanorama is designed to be used in scRNA-seq pipelines downstream of noise-reduction methods, including those for imputation and highly-variable gene filtering. The results from Scanorama integration and batch correction can then be used as input to other tools for scRNA-seq clustering, visualization, and analysis.

Load package

library(Seurat)
library(Matrix)
library(stringr)
library(readr)
library(here)
library(fitdistrplus)
library(dplyr)
library(monocle)
library(reticulate)
library(pals)

Load data

load(file = "/Users/cuihening/Downloads/chicken_heart-master/data/cc.genes.rda")
samples = c("D4", "D7_LV", "D7_RV", "D10_LV", "D10_RV", "D14_LV", "D14_RV")
load("/Users/cuihening/Desktop/robjs/chicken_raw.Robj")
dim(chicken)
## [1] 18797 22315
table(chicken$orig.ident)
## 
## D10_LV D10_RV D14_LV D14_RV     D4  D7_LV  D7_RV 
##   2763   2427   1335   1674   5653   4555   3908
DefaultAssay(chicken)
## [1] "RNA"

Extract expression values for individual datasets function

GetAssayData: General accessor and setter functions for Assay objects. GetAssayData can be used to pull information from any of the expression matrices (eg. “counts”, “data”, or “scale.data”).

extractRNA_chicken <- function(seurat.object, sample_name){
  return(t(as.matrix(GetAssayData(seurat.object)))[colnames(seurat.object)[seurat.object$orig.ident == sample_name],])
}

Prepare data

data = list(extractRNA_chicken(chicken, samples[1]), extractRNA_chicken(chicken, samples[2]), extractRNA_chicken(chicken, samples[3]), extractRNA_chicken(chicken, samples[4]), extractRNA_chicken(chicken, samples[5]), extractRNA_chicken(chicken, samples[6]), extractRNA_chicken(chicken, samples[7]))
gene_list = list(rownames(chicken), rownames(chicken), rownames(chicken), rownames(chicken), rownames(chicken), rownames(chicken), rownames(chicken))

Import scanorama

I got multiple problem when I trying to set up the environment of scanorama. The default r-miniconda python does not works for me when calling the scanorama.I finally solved with use_python() methon to set python path to the anaconda environment.

path_to_python = '/Users/cuihening/opt/anaconda3/envs/r-reticulate/bin/python'
use_python(path_to_python)
scanorama = import_from_path('scanorama')

Integrate dataset

correct() setting all latent factor to median and reversing the regression model. Reverse regression model: a reverse regression approach to randomized clinical trials, with focus on the dependence of treatment assignment on the clinical outcomes of interest. A reverse regression model is essentially a semiparametric density ratio model for the outcome distributions in the two treatment groups. {r} [link] (https://www3.stat.sinica.edu.tw/sstest/oldpdf/A24n414.pdf)

integrated.corrected.data = scanorama$correct(data, gene_list, return_dimred=TRUE, return_dense=TRUE, ds_names = samples, verbose = TRUE)

save(integrated.corrected.data,file="/Users/cuihening/Desktop/robjs/corrected_norm_scano.Robj")

load("/Users/cuihening/Desktop/robjs/corrected_norm_scano.Robj")

Created integrate array from integrated data.

corrected_scanorama <- t(do.call(rbind, integrated.corrected.data[[2]]))
colnames(corrected_scanorama) <- colnames(chicken)
rownames(corrected_scanorama) <- integrated.corrected.data[[3]]
dim(corrected_scanorama)
## [1] 18797 22315
corrected_scanorama_pca <- t(do.call(rbind, integrated.corrected.data[[1]]))
colnames(corrected_scanorama_pca) <- colnames(chicken)
dim(corrected_scanorama_pca)
## [1]   100 22315

Data analysis

Create assay from integrated values and save to seurat object

scanorama_assay <- CreateAssayObject(data = corrected_scanorama)
chicken[["scanorama"]] <- scanorama_assay
DefaultAssay(chicken) <- "scanorama"

Preprocess scanorama values and perform PCA FindVaribaleFeatures:Identifies features that are outliers on a ’mean variability plot

PS: I encounter the memory limit issue in R and here is the solution library(usethis) usethis::edit_r_environ() R_MAX_VSIZE=100Gb then restart R

DefaultAssay(chicken)
## [1] "scanorama"
chicken <- FindVariableFeatures(chicken, assay = "scanorama", selection.method = "vst")
chicken <- ScaleData(chicken)
## Centering and scaling data matrix
chicken <- RunPCA(object = chicken, assay = "scanorama", verbose = F, reduction.name = "pca_scanorama")

Clustering and UMAP reduction on scanorama values

chicken <- FindNeighbors(object=chicken, assay = "scanorama", reduction = "pca_scanorama", dims = 1:20, k.param = 30, force.recalc = TRUE)
## Computing nearest neighbor graph
## Computing SNN
chicken <- FindClusters(object=chicken, resolution=0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 22315
## Number of edges: 1380745
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9168
## Number of communities: 15
## Elapsed time: 6 seconds
table(chicken$scanorama_snn_res.0.5)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
## 4717 3842 1876 1864 1798 1722 1424 1290 1097  727  680  484  386  224  184
chicken <- RunUMAP(object = chicken, assay = "scanorama", reduction = "pca_scanorama", dims = 1:20, reduction.name = "umap_scanorama", metric = "correlation")
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 23:30:00 UMAP embedding parameters a = 0.9922 b = 1.112
## 23:30:00 Read 22315 rows and found 20 numeric columns
## 23:30:00 Using Annoy for neighbor search, n_neighbors = 30
## 23:30:00 Annoy build: subtracting row means for correlation
## 23:30:00 Building Annoy index with metric = correlation, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 23:30:02 Annoy search: subtracting row means for correlation
## 23:30:02 Writing NN index file to temp file /var/folders/l7/frm5yjw16mldyr4r2fldvrk80000gn/T//Rtmpyjct1u/file1507c30760874
## 23:30:02 Searching Annoy index using 1 thread, search_k = 3000
## 23:30:10 Annoy recall = 100%
## 23:30:10 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 23:30:11 Initializing from normalized Laplacian + noise (using irlba)
## 23:30:12 Commencing optimization for 200 epochs, with 987002 positive edges
## 23:30:27 Optimization finished

UMAP visualization

scanorama_snn_res.0.5

DimPlot(chicken, reduction = "umap_scanorama", label = TRUE, group.by = "scanorama_snn_res.0.5")

orig.ident

DimPlot(chicken, reduction = "umap_scanorama", group.by = "orig.ident")

save(chicken, file="/Users/cuihening/Desktop/robjs/chicken_normalised_scanorama2.Robj")

load(file="/Users/cuihening/Desktop/robjs/chicken_normalised_scanorama2.Robj")

Marker analysis

FindALLMarkers:Finds markers (differentially expressed genes) for each of the identity classes in a dataset

DefaultAssay(object = chicken) <- "RNA"
markers.all = FindAllMarkers(chicken, assay = "RNA", do.print = TRUE, logfc.threshold = 0.5, return.thresh = 0.1, min.pct = 0.5, only.pos = TRUE)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Warning in FindMarkers.default(object = data.use, slot = data.slot, counts =
## counts, : No features pass logfc.threshold threshold; returning empty data.frame
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14

I changed original avg_logFC into avg_log2FC, since the avg_logFC is not defined in the function.

markers.top10 = markers.all %>% group_by(cluster) %>% top_n(10, avg_log2FC)
markers.top20 = markers.all %>% group_by(cluster) %>% top_n(20, avg_log2FC)
write.csv(markers.all, "/Users/cuihening/Desktop/csv/markers.all.clusters.csv")

Use differential marker analysis to label clusters with cell type names

Idents(chicken) <- chicken$scanorama_snn_res.0.5
DimPlot(object = chicken, reduction = "umap_scanorama", label = TRUE)

chicken <- RenameIdents(chicken, `0` = "Fibroblast cells", `1` = "Cardiomyocytes-1", `2` = "Immature myocardial cells",
                                   `3` = "Endocardial cells", `4` = "Cardiomyocytes-2", `5` = "Valve cells",
                                   `6` = "TMSB4X high cells",
                                   `7` = "Epi-epithelial cells", `8` = "Erythrocytes", `9` = "Vascular endothelial cells", `10` = "Erythrocytes",
                                   `11` = "Mural cells", `12` = "Epi-mesenchymal cells", `13` = "MT-enriched cardiomyocytes", `14` = "Macrophages",
                                   `15` = "Erythrocytes", `16` = "Dendritic cells")
## Warning: Cannot find identity 16
## Warning: Cannot find identity 15
chicken$celltypes.0.5 <- Idents(chicken)
chicken.integrated <- chicken
save(chicken.integrated, file="/Users/cuihening/Desktop/robjs/chicken_normalised_scanorama3.Robj")
markers.all <- read.csv(file = "/Users/cuihening/Desktop/csv/markers.all.clusters.csv", row.names = 1)
markers.all <- subset(markers.all[!(rownames(markers.all) %in% grep("^ENSGAL", x = rownames(markers.all), value = TRUE)),])
markers.top5 = markers.all %>% group_by(cluster) %>% top_n(5, avg_log2FC)
levels(markers.top5$cluster) == levels(chicken.integrated$celltypes.0.5)
## logical(0)
pdf(file="allClustersDotplot.pdf",
    width= 6.7, height=2.5, paper="special", bg="transparent",
    fonts="Helvetica", colormodel = "rgb", pointsize=5, useDingbats = F)
DotPlot(chicken.integrated, features = unique(markers.top5$gene), cols = c("lightgray", "brown"), scale.by = "size", dot.scale = 2.0, dot.min = 0.01) + # scale_colour_viridis_c(direction = -1)+
  theme_bw() + scale_color_gradient(low = "lightgray", high = "brown", trans = "exp") + 
  theme(plot.background=element_blank(),
        panel.grid = element_line(size = 0.1),
        legend.position = "bottom",
        legend.title = element_text(colour = "black", size = 7, family = "Helvetica"), 
        legend.text = element_text(colour = "black", size = 6, family = "Helvetica"),
        legend.spacing = unit(0, "pt"),
        legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
        legend.box.margin=margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
        plot.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
        axis.title=element_blank(),
        axis.title.y=element_blank(),
        axis.text.y = element_text(colour = "black", size = 6, family = "Helvetica", angle = 0, color = rev(as.vector(kelly())[3:(2+length(levels(markers.top5$cluster)))])), # element_blank(), # 
        axis.text.x = element_text(colour = "black", size = 5.0, family = "Helvetica", angle = 45, vjust = 1, hjust = 1),
        plot.title=element_blank())
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## Warning: Removed 60 rows containing missing values (`geom_point()`).
dev.off()
## quartz_off_screen 
##                 2