# libraries
library(Seurat) # for scrnaseq analyisis
## Attaching SeuratObject
library(here) # for reproducible paths
## here() starts at C:/Users/nbestard/OneDrive/Laura
srt_old <- readRDS(here("Data", "HCA", "srt_opcs.RDS"))
srt_dev <- readRDS(here("Data", "Embryos", "hca_dev_scanpy_harmony_opcs.rds"))
The seurat adult object contain the sum of spliced and unspliced in the counts slot, and it has been processed with scater QC, scran normalisation, feature selection, scaling, dimensional reduction, annotation.
In the Embryonic dataset the counts slot also contain the summed spliced and unspliced matrices. Sometimes alevin algorithm gives non-integers for counts! The data slot has the normalized and scaled data from scanpy. It includes a pca reductions from harmony where an integration by donor was done.
The data slot contains the normalized data, the counts the raw data and the scale the scaled data. There are 84 entries in the metadata.
dim(srt_old$RNA@data)
## [1] 19280 3391
# Select a subset of the data to display to see what kind of data
# is stored in each slot
srt_old$RNA@data[10:15,50:53]
## 6 x 4 sparse Matrix of class "dgCMatrix"
## BA4_possorted_genome_bam_A624Z:TCAATCTAGTGATGGCx
## SDF4 .
## C1QTNF12 .
## UBE2J2 .
## ACAP3 0.8772232
## INTS11 .
## DVL1 .
## BA4_possorted_genome_bam_A624Z:TCACTCGCAACTGAAAx
## SDF4 .
## C1QTNF12 .
## UBE2J2 .
## ACAP3 .
## INTS11 .
## DVL1 .
## BA4_possorted_genome_bam_A624Z:TCGACGGCACTCTCGTx
## SDF4 .
## C1QTNF12 .
## UBE2J2 .
## ACAP3 1.104579
## INTS11 .
## DVL1 .
## BA4_possorted_genome_bam_A624Z:TCCATCGGTGTTGACTx
## SDF4 .
## C1QTNF12 .
## UBE2J2 .
## ACAP3 .
## INTS11 .
## DVL1 .
srt_old$RNA@counts[10:15,50:53]
## 6 x 4 sparse Matrix of class "dgCMatrix"
## BA4_possorted_genome_bam_A624Z:TCAATCTAGTGATGGCx
## SDF4 .
## C1QTNF12 .
## UBE2J2 .
## ACAP3 1
## INTS11 .
## DVL1 .
## BA4_possorted_genome_bam_A624Z:TCACTCGCAACTGAAAx
## SDF4 .
## C1QTNF12 .
## UBE2J2 .
## ACAP3 .
## INTS11 .
## DVL1 .
## BA4_possorted_genome_bam_A624Z:TCGACGGCACTCTCGTx
## SDF4 .
## C1QTNF12 .
## UBE2J2 .
## ACAP3 1
## INTS11 .
## DVL1 .
## BA4_possorted_genome_bam_A624Z:TCCATCGGTGTTGACTx
## SDF4 .
## C1QTNF12 .
## UBE2J2 .
## ACAP3 .
## INTS11 .
## DVL1 .
srt_old$RNA@scale.data[10:15,50:53]
## BA4_possorted_genome_bam_A624Z:TCAATCTAGTGATGGCx
## RUNX3 -0.1095590
## IFI6 -0.3099379
## LAPTM5 -0.2745048
## CSMD2 -0.4193383
## GRIK3 -0.2666249
## HEYL -0.1188837
## BA4_possorted_genome_bam_A624Z:TCACTCGCAACTGAAAx
## RUNX3 -0.1095590
## IFI6 -0.3099379
## LAPTM5 -0.2745048
## CSMD2 -0.4193383
## GRIK3 -0.2666249
## HEYL -0.1188837
## BA4_possorted_genome_bam_A624Z:TCGACGGCACTCTCGTx
## RUNX3 -0.1095590
## IFI6 -0.3099379
## LAPTM5 -0.2745048
## CSMD2 1.1838427
## GRIK3 2.3162946
## HEYL -0.1188837
## BA4_possorted_genome_bam_A624Z:TCCATCGGTGTTGACTx
## RUNX3 -0.1095590
## IFI6 -0.3099379
## LAPTM5 -0.2745048
## CSMD2 -0.4193383
## GRIK3 -0.2666249
## HEYL -0.1188837
colnames(srt_old@meta.data)
## [1] "orig.ident" "nCount_RNA"
## [3] "nFeature_RNA" "process_number"
## [5] "Barcode" "caseNO"
## [7] "Tissue" "gender"
## [9] "Age" "BBN"
## [11] "PMI" "CauseOfDeath_category"
## [13] "CauseOfDeath_1a" "CauseOfDeath_1b"
## [15] "CauseOfDeath_1c" "CauseOfDeath_1d"
## [17] "Cryostat_date" "SlideBox"
## [19] "Location_Block" "comments"
## [21] "RNAconcRIN" "RINvalue"
## [23] "RINvalueDate" "RINvalueRemeasured"
## [25] "rEeXTRACT" "X10XBatch"
## [27] "SequencingPool" "project_name"
## [29] "uniq_id" "percent.mt"
## [31] "ident" "sum"
## [33] "detected" "total"
## [35] "low_lib_size" "large_lib_size"
## [37] "low_n_features" "high_n_features"
## [39] "high_subsets_mito_percent" "discard"
## [41] "ProcessNumber" "ScaterQC_failed"
## [43] "sizeFactor" "scDblFinder_weighted"
## [45] "scDblFinder_ratio" "scDblFinder_class"
## [47] "scDblFinder_score" "total_percent_mito"
## [49] "percent_ribo" "RNA_snn_res.0.5"
## [51] "seurat_clusters" "RNA_snn_res.0.8"
## [53] "CountsPerCluster_res_0_8" "IdCountGroup"
## [55] "samplesPerCluster_res_0_8" "sample_count_per_cl_group"
## [57] "AgeGroup" "RIN"
## [59] "ageSex" "S.Score"
## [61] "G2M.Score" "Phase"
## [63] "old.ident" "RNA_snn_res.0.1"
## [65] "rough_annot" "sample_qc_failed"
## [67] "QC_UMI" "caseNO_tissue"
## [69] "clusters_0.8" "clusters_0.5"
## [71] "RNA_snn_res.0.9" "clusters_0.9"
## [73] "RNA_snn_res.1" "clusters_1"
## [75] "RNA_snn_res.1.1" "clusters_1.1"
## [77] "RNA_snn_res.1.2" "clusters_1.2"
## [79] "RNA_snn_res.1.3" "clusters_1.3"
## [81] "RNA_snn_res.1.4" "clusters_1.4"
## [83] "RNA_snn_res.1.5" "clusters_named"
DimPlot(srt_old)
The data slot contains the scaled data and the counts the raw data. There is no normalized non-scaled data. The metadata contains 17 entries
dim(srt_dev$RNA@data)
## [1] 26428 388
dim(srt_dev$RNA@counts)
## [1] 26428 388
dim(srt_dev$RNA@scale.data)
## [1] 0 0
# Select a subset of the data to display to see what kind of data
# is stored in each slot
srt_dev$RNA@counts[50:55,1:3]
## 6 x 3 sparse Matrix of class "dgCMatrix"
## TTATTGCTCAACTGAC_12006WApool01-N__70_13103
## TNFRSF18 .
## TNFRSF4 .
## SDF4 .
## B3GALT6 .
## C1QTNF12 .
## UBE2J2 1
## GCTACAAAGGTACTGG_12006WApool01-N__70_13103
## TNFRSF18 .
## TNFRSF4 .
## SDF4 .
## B3GALT6 1
## C1QTNF12 .
## UBE2J2 .
## TTTCACAGTCTCTCCA_12006WApool01-N__70_13103
## TNFRSF18 .
## TNFRSF4 .
## SDF4 .
## B3GALT6 .
## C1QTNF12 .
## UBE2J2 .
srt_dev$RNA@data[50:55,1:3]
## TTATTGCTCAACTGAC_12006WApool01-N__70_13103
## TNFRSF18 -0.02794619
## TNFRSF4 -0.01033824
## SDF4 -0.24113661
## B3GALT6 -0.24471466
## C1QTNF12 -0.01850932
## UBE2J2 1.42258894
## GCTACAAAGGTACTGG_12006WApool01-N__70_13103
## TNFRSF18 -0.02794619
## TNFRSF4 -0.01033824
## SDF4 -0.24113661
## B3GALT6 2.89962769
## C1QTNF12 -0.01850932
## UBE2J2 -0.39016941
## TTTCACAGTCTCTCCA_12006WApool01-N__70_13103
## TNFRSF18 -0.02794619
## TNFRSF4 -0.01033824
## SDF4 -0.24113661
## B3GALT6 -0.24471466
## C1QTNF12 -0.01850932
## UBE2J2 -0.39016941
colnames(srt_dev@meta.data)
## [1] "orig.ident" "nCount_RNA" "nFeature_RNA"
## [4] "cell_barcode" "sampleid" "gender"
## [7] "age" "xbatch" "xversion"
## [10] "sequecingpool" "uniq_id" "n_genes"
## [13] "n_genes_by_counts" "total_counts" "total_counts_mt"
## [16] "pct_counts_mt" "doublet_scores"
head(srt_dev@meta.data)
## orig.ident nCount_RNA
## TTATTGCTCAACTGAC_12006WApool01-N__70_13103 SeuratProject 11687.043
## GCTACAAAGGTACTGG_12006WApool01-N__70_13103 SeuratProject 11493.330
## TTTCACAGTCTCTCCA_12006WApool01-N__70_13103 SeuratProject 12014.238
## GGCACGTCAAATGGTA_14771WApool01-N__HCA_71_14285 SeuratProject 21195.432
## TCGGTCTCACAATGTC_14771WApool01-N__HCA_71_14285 SeuratProject 14419.953
## CGGAGAATCTGGCCGA_14771WApool01-N__HCA_71_14285 SeuratProject 8690.293
## nFeature_RNA
## TTATTGCTCAACTGAC_12006WApool01-N__70_13103 4226
## GCTACAAAGGTACTGG_12006WApool01-N__70_13103 4566
## TTTCACAGTCTCTCCA_12006WApool01-N__70_13103 4570
## GGCACGTCAAATGGTA_14771WApool01-N__HCA_71_14285 6145
## TCGGTCTCACAATGTC_14771WApool01-N__HCA_71_14285 5246
## CGGAGAATCTGGCCGA_14771WApool01-N__HCA_71_14285 3859
## cell_barcode
## TTATTGCTCAACTGAC_12006WApool01-N__70_13103 TTATTGCTCAACTGAC_12006WApool01-N__70_13103
## GCTACAAAGGTACTGG_12006WApool01-N__70_13103 GCTACAAAGGTACTGG_12006WApool01-N__70_13103
## TTTCACAGTCTCTCCA_12006WApool01-N__70_13103 TTTCACAGTCTCTCCA_12006WApool01-N__70_13103
## GGCACGTCAAATGGTA_14771WApool01-N__HCA_71_14285 GGCACGTCAAATGGTA_14771WApool01-N__HCA_71_14285
## TCGGTCTCACAATGTC_14771WApool01-N__HCA_71_14285 TCGGTCTCACAATGTC_14771WApool01-N__HCA_71_14285
## CGGAGAATCTGGCCGA_14771WApool01-N__HCA_71_14285 CGGAGAATCTGGCCGA_14771WApool01-N__HCA_71_14285
## sampleid
## TTATTGCTCAACTGAC_12006WApool01-N__70_13103 12006WApool01-N__70_13103
## GCTACAAAGGTACTGG_12006WApool01-N__70_13103 12006WApool01-N__70_13103
## TTTCACAGTCTCTCCA_12006WApool01-N__70_13103 12006WApool01-N__70_13103
## GGCACGTCAAATGGTA_14771WApool01-N__HCA_71_14285 14771WApool01-N__HCA_71_14285
## TCGGTCTCACAATGTC_14771WApool01-N__HCA_71_14285 14771WApool01-N__HCA_71_14285
## CGGAGAATCTGGCCGA_14771WApool01-N__HCA_71_14285 14771WApool01-N__HCA_71_14285
## gender age xbatch xversion
## TTATTGCTCAACTGAC_12006WApool01-N__70_13103 M 19pcw 10 3
## GCTACAAAGGTACTGG_12006WApool01-N__70_13103 M 19pcw 10 3
## TTTCACAGTCTCTCCA_12006WApool01-N__70_13103 M 19pcw 10 3
## GGCACGTCAAATGGTA_14771WApool01-N__HCA_71_14285 F 20pcw 12 3
## TCGGTCTCACAATGTC_14771WApool01-N__HCA_71_14285 F 20pcw 12 3
## CGGAGAATCTGGCCGA_14771WApool01-N__HCA_71_14285 F 20pcw 12 3
## sequecingpool uniq_id n_genes
## TTATTGCTCAACTGAC_12006WApool01-N__70_13103 nan HCA_13103 4226
## GCTACAAAGGTACTGG_12006WApool01-N__70_13103 nan HCA_13103 4566
## TTTCACAGTCTCTCCA_12006WApool01-N__70_13103 nan HCA_13103 4571
## GGCACGTCAAATGGTA_14771WApool01-N__HCA_71_14285 nan HCA_14285 6145
## TCGGTCTCACAATGTC_14771WApool01-N__HCA_71_14285 nan HCA_14285 5248
## CGGAGAATCTGGCCGA_14771WApool01-N__HCA_71_14285 nan HCA_14285 3860
## n_genes_by_counts total_counts
## TTATTGCTCAACTGAC_12006WApool01-N__70_13103 4226 11687.043
## GCTACAAAGGTACTGG_12006WApool01-N__70_13103 4566 11493.330
## TTTCACAGTCTCTCCA_12006WApool01-N__70_13103 4570 12014.238
## GGCACGTCAAATGGTA_14771WApool01-N__HCA_71_14285 6145 21195.432
## TCGGTCTCACAATGTC_14771WApool01-N__HCA_71_14285 5246 14419.953
## CGGAGAATCTGGCCGA_14771WApool01-N__HCA_71_14285 3859 8690.293
## total_counts_mt pct_counts_mt
## TTATTGCTCAACTGAC_12006WApool01-N__70_13103 18.0000 0.1540167
## GCTACAAAGGTACTGG_12006WApool01-N__70_13103 119.0000 1.0353832
## TTTCACAGTCTCTCCA_12006WApool01-N__70_13103 84.0000 0.6991705
## GGCACGTCAAATGGTA_14771WApool01-N__HCA_71_14285 172.4706 0.8137158
## TCGGTCTCACAATGTC_14771WApool01-N__HCA_71_14285 65.0000 0.4507643
## CGGAGAATCTGGCCGA_14771WApool01-N__HCA_71_14285 9.0000 0.1035638
## doublet_scores
## TTATTGCTCAACTGAC_12006WApool01-N__70_13103 0.03379722
## GCTACAAAGGTACTGG_12006WApool01-N__70_13103 0.07428571
## TTTCACAGTCTCTCCA_12006WApool01-N__70_13103 0.20000000
## GGCACGTCAAATGGTA_14771WApool01-N__HCA_71_14285 0.12903226
## TCGGTCTCACAATGTC_14771WApool01-N__HCA_71_14285 0.35714286
## CGGAGAATCTGGCCGA_14771WApool01-N__HCA_71_14285 0.12903226
DimPlot(srt_dev)
In both objects we need to delete the scaled data, as there are both subsets of bigger datasets. We also select only the genes in common between both datasets.
Not all the metadata is usefull, for example all the different clusterings are not required as they were done with other cells to subset the OPCs.
# Create vector with genes common in both objects
features <- intersect(rownames(srt_old), rownames(srt_dev))
# Subset both objects with only these features
srt_old <- subset(srt_old, features = features)
srt_dev <- subset(srt_dev, features = features)
# Subset metadata of interest from the developmental HCA
keep_metadata_dev <-
c(
"cell_barcode",
"sampleid",
"gender",
"age",
"xbatch",
"xversion",
"sequecingpool",
"uniq_id",
"n_genes",
"total_counts",
"pct_counts_mt",
"doublet_scores"
)
srt_dev@meta.data <- srt_dev@meta.data[, keep_metadata_dev]
# Indicate that the counts and features are from the previous object with all the
# celltypes
srt_dev$detected_all_celltypes <- srt_dev$n_genes
srt_dev$total_all_celltypes <- srt_dev$total_counts
srt_dev$n_genes <- NULL
srt_dev$total_counts <- NULL
# Correct name typo
srt_dev$seq_pool <- srt_dev$sequecingpool
# Add column that indicates that these are embryos
srt_dev$AdultVsEmbryo <- "embryo"
srt_dev$AgeGroup <- "Embryo"
# Delete the data slot (with counts normalized and scaled) to be sure it
# won't be used, as it is not correct for this subset
srt_dev[["RNA"]]@data <- matrix(
data = NA,
nrow = dim(srt_dev)[1],
ncol = dim(srt_dev)[2],
dimnames = list(rownames(srt_dev), colnames(srt_dev))
)
# Subset metadata of interest from the HCA old object
keep_metadata_old <-
c(
"process_number",
"Barcode",
"detected",
"total",
"caseNO",
"Tissue",
"gender",
"Age",
"BBN",
"PMI",
"CauseOfDeath_category",
"X10XBatch",
"SequencingPool",
"total_percent_mito",
"AgeGroup",
"RIN" ,
"ageSex",
"uniq_id",
"scDblFinder_class",
"scDblFinder_score"
)
srt_old@meta.data <- srt_old@meta.data[, keep_metadata_old]
# Indicate that the counts and features are from the previous object with all the
# celltypes
srt_old$detected_all_celltypes <- srt_old$detected
srt_old$total_all_celltypes <- srt_old$total
srt_old$detected <- NULL
srt_old$total <- NULL
# Change some names to match the other object naming
srt_old$age <- srt_old$Age
srt_old$cell_barcode <- srt_old$Barcode
srt_old$seq_pool <- srt_old$SequencingPool
srt_old$xbatch <- srt_old$X10XBatch
srt_old$pct_counts_mt <- srt_old$total_percent_mito
srt_old$doublet_scores <- srt_old$scDblFinder_score
# Delete the previous name
srt_old$Age <- NULL
srt_old$Barcode <- NULL
srt_old$SequencingPool <- NULL
srt_old$X10XBatch <- NULL
srt_old$total_percent_mito <- NULL
srt_old$scDblFinder_score <- NULL
# Add column that indicates that these are adults
srt_old$AdultVsEmbryo <- "adult"
# Delete the scaled slot as it is not correct for this subset
srt_old[["RNA"]]@scale.data <- as.matrix(NA)
The integration is performed following Seurat’s vignette, that performs the integration as described in (Sturart2019?).
Some changes include:
Setting the k.filter to NA, as suggested in this comment for small datasets. The default is 200, and eventhough the comment was for datasets with less than 200 cells, I feel it is still relevant for this dataset that just contains 388 features.
k.anchor = 10. These datasets differ so it might benefit from a more “aggressive” integration. Even if it duplicates Seurat’s default this value is still very reasonable. For example the default in fastMNN, from Bioconductor is 20 (even if it is not exactly the same algorithm I think it is comparable).
# Create list with objects
srt_list <- list(srt_dev, srt_old)
# And remove the previous objects to save space
rm(srt_dev, srt_old)
# normalize and identify variable features for each dataset independently
srt_list<- lapply(X = srt_list, FUN = function(x) {
x <- NormalizeData(x)
x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})
# select features that are repeatedly variable across datasets for integration
var_features <- SelectIntegrationFeatures(object.list = srt_list)
# create anchors to integrate the datasets
anchors <- FindIntegrationAnchors(object.list = srt_list, anchor.features = var_features, k.anchor = 10, k.filter = NA )
# create an 'integrated' data assay combining srt_dev and srt_old
srt_combined <- IntegrateData(anchorset = anchors)
# specify that we will perform downstream analysis on the corrected data, the original unmodified data still resides in the 'RNA' assay
DefaultAssay(srt_combined) <- "integrated"
Idents(srt_combined) <- srt_combined$AdultVsEmbryo
# Run the standard workflow for visualization and clustering
srt_combined <- ScaleData(srt_combined, verbose = FALSE)
srt_combined <- RunPCA(srt_combined, verbose = FALSE)
# Choose the number of PCs
ElbowPlot(srt_combined, ndims = 50)
# 20
srt_combined <- RunUMAP(srt_combined, reduction = "pca", dims = 1:20)
DimPlot(srt_combined)
#srt_combined <- FindNeighbors(srt_combined, reduction = "pca", dims = 1:20)
#srt_combined <- FindClusters(srt_combined, resolution = 0.5)