# libraries
library(Seurat) # for scrnaseq analyisis
## Attaching SeuratObject
library(here) # for reproducible paths
## here() starts at C:/Users/nbestard/OneDrive/Laura
srt_old <- readRDS(here("Data", "HCA", "srt_opcs.RDS"))
srt_dev <- readRDS(here("Data", "Embryos", "hca_harmony_opcs.rds"))
The seurat adult object contain the sum of spliced and unspliced in the counts slot, and it has been processed with scater QC, scran normalisation, feature selection, scaling, dimensional reduction, annotation.
In the Embryonic dataset the counts slot also contain the summed spliced and unspliced matrices. Sometimes alevin algorithm gives non-integers for counts! The data slot has the normalized data from scanpy. It includes a pca reductions from harmony where an integration by batch was done.
The data slot contains the normalized data, the counts the raw data and the scale the scaled data. There are 84 entries in the metadata.
dim(srt_old$RNA@data)
## [1] 19280 3391
# Select a subset of the data to display to see what kind of data
# is stored in each slot
srt_old$RNA@data[10:15,50:53]
## 6 x 4 sparse Matrix of class "dgCMatrix"
## BA4_possorted_genome_bam_A624Z:TCAATCTAGTGATGGCx
## SDF4 .
## C1QTNF12 .
## UBE2J2 .
## ACAP3 0.8772232
## INTS11 .
## DVL1 .
## BA4_possorted_genome_bam_A624Z:TCACTCGCAACTGAAAx
## SDF4 .
## C1QTNF12 .
## UBE2J2 .
## ACAP3 .
## INTS11 .
## DVL1 .
## BA4_possorted_genome_bam_A624Z:TCGACGGCACTCTCGTx
## SDF4 .
## C1QTNF12 .
## UBE2J2 .
## ACAP3 1.104579
## INTS11 .
## DVL1 .
## BA4_possorted_genome_bam_A624Z:TCCATCGGTGTTGACTx
## SDF4 .
## C1QTNF12 .
## UBE2J2 .
## ACAP3 .
## INTS11 .
## DVL1 .
srt_old$RNA@counts[10:15,50:53]
## 6 x 4 sparse Matrix of class "dgCMatrix"
## BA4_possorted_genome_bam_A624Z:TCAATCTAGTGATGGCx
## SDF4 .
## C1QTNF12 .
## UBE2J2 .
## ACAP3 1
## INTS11 .
## DVL1 .
## BA4_possorted_genome_bam_A624Z:TCACTCGCAACTGAAAx
## SDF4 .
## C1QTNF12 .
## UBE2J2 .
## ACAP3 .
## INTS11 .
## DVL1 .
## BA4_possorted_genome_bam_A624Z:TCGACGGCACTCTCGTx
## SDF4 .
## C1QTNF12 .
## UBE2J2 .
## ACAP3 1
## INTS11 .
## DVL1 .
## BA4_possorted_genome_bam_A624Z:TCCATCGGTGTTGACTx
## SDF4 .
## C1QTNF12 .
## UBE2J2 .
## ACAP3 .
## INTS11 .
## DVL1 .
srt_old$RNA@scale.data[10:15,50:53]
## BA4_possorted_genome_bam_A624Z:TCAATCTAGTGATGGCx
## RUNX3 -0.1095590
## IFI6 -0.3099379
## LAPTM5 -0.2745048
## CSMD2 -0.4193383
## GRIK3 -0.2666249
## HEYL -0.1188837
## BA4_possorted_genome_bam_A624Z:TCACTCGCAACTGAAAx
## RUNX3 -0.1095590
## IFI6 -0.3099379
## LAPTM5 -0.2745048
## CSMD2 -0.4193383
## GRIK3 -0.2666249
## HEYL -0.1188837
## BA4_possorted_genome_bam_A624Z:TCGACGGCACTCTCGTx
## RUNX3 -0.1095590
## IFI6 -0.3099379
## LAPTM5 -0.2745048
## CSMD2 1.1838427
## GRIK3 2.3162946
## HEYL -0.1188837
## BA4_possorted_genome_bam_A624Z:TCCATCGGTGTTGACTx
## RUNX3 -0.1095590
## IFI6 -0.3099379
## LAPTM5 -0.2745048
## CSMD2 -0.4193383
## GRIK3 -0.2666249
## HEYL -0.1188837
colnames(srt_old@meta.data)
## [1] "orig.ident" "nCount_RNA"
## [3] "nFeature_RNA" "process_number"
## [5] "Barcode" "caseNO"
## [7] "Tissue" "gender"
## [9] "Age" "BBN"
## [11] "PMI" "CauseOfDeath_category"
## [13] "CauseOfDeath_1a" "CauseOfDeath_1b"
## [15] "CauseOfDeath_1c" "CauseOfDeath_1d"
## [17] "Cryostat_date" "SlideBox"
## [19] "Location_Block" "comments"
## [21] "RNAconcRIN" "RINvalue"
## [23] "RINvalueDate" "RINvalueRemeasured"
## [25] "rEeXTRACT" "X10XBatch"
## [27] "SequencingPool" "project_name"
## [29] "uniq_id" "percent.mt"
## [31] "ident" "sum"
## [33] "detected" "total"
## [35] "low_lib_size" "large_lib_size"
## [37] "low_n_features" "high_n_features"
## [39] "high_subsets_mito_percent" "discard"
## [41] "ProcessNumber" "ScaterQC_failed"
## [43] "sizeFactor" "scDblFinder_weighted"
## [45] "scDblFinder_ratio" "scDblFinder_class"
## [47] "scDblFinder_score" "total_percent_mito"
## [49] "percent_ribo" "RNA_snn_res.0.5"
## [51] "seurat_clusters" "RNA_snn_res.0.8"
## [53] "CountsPerCluster_res_0_8" "IdCountGroup"
## [55] "samplesPerCluster_res_0_8" "sample_count_per_cl_group"
## [57] "AgeGroup" "RIN"
## [59] "ageSex" "S.Score"
## [61] "G2M.Score" "Phase"
## [63] "old.ident" "RNA_snn_res.0.1"
## [65] "rough_annot" "sample_qc_failed"
## [67] "QC_UMI" "caseNO_tissue"
## [69] "clusters_0.8" "clusters_0.5"
## [71] "RNA_snn_res.0.9" "clusters_0.9"
## [73] "RNA_snn_res.1" "clusters_1"
## [75] "RNA_snn_res.1.1" "clusters_1.1"
## [77] "RNA_snn_res.1.2" "clusters_1.2"
## [79] "RNA_snn_res.1.3" "clusters_1.3"
## [81] "RNA_snn_res.1.4" "clusters_1.4"
## [83] "RNA_snn_res.1.5" "clusters_named"
DimPlot(srt_old)
The data slot contains the scaled data and the counts the raw data. There is no normalized non-scaled data. The metadata contains 17 entries
dim(srt_dev$RNA@data)
## [1] 26428 457
dim(srt_dev$RNA@counts)
## [1] 26428 457
dim(srt_dev$RNA@scale.data)
## [1] 0 0
# Select a subset of the data to display to see what kind of data
# is stored in each slot
srt_dev$RNA@counts[50:55,1:3]
## 6 x 3 sparse Matrix of class "dgCMatrix"
## CGAGAAGCATTCCTAT_12006WApool01-N__70_13103
## TNFRSF18 .
## TNFRSF4 .
## SDF4 .
## B3GALT6 1
## C1QTNF12 .
## UBE2J2 .
## TGCATCCGTTATCTTC_12006WApool01-N__70_13103
## TNFRSF18 .
## TNFRSF4 .
## SDF4 1
## B3GALT6 .
## C1QTNF12 .
## UBE2J2 2
## CATACCCAGCACACAG_12006WApool01-N__70_13103
## TNFRSF18 .
## TNFRSF4 .
## SDF4 4
## B3GALT6 3
## C1QTNF12 .
## UBE2J2 4
srt_dev$RNA@data[50:55,1:3]
## 6 x 3 sparse Matrix of class "dgCMatrix"
## CGAGAAGCATTCCTAT_12006WApool01-N__70_13103
## TNFRSF18 .
## TNFRSF4 .
## SDF4 .
## B3GALT6 0.2504616
## C1QTNF12 .
## UBE2J2 .
## TGCATCCGTTATCTTC_12006WApool01-N__70_13103
## TNFRSF18 .
## TNFRSF4 .
## SDF4 0.2680517
## B3GALT6 .
## C1QTNF12 .
## UBE2J2 0.4792293
## CATACCCAGCACACAG_12006WApool01-N__70_13103
## TNFRSF18 .
## TNFRSF4 .
## SDF4 0.7698283
## B3GALT6 0.6256959
## C1QTNF12 .
## UBE2J2 0.7698283
colnames(srt_dev@meta.data)
## [1] "orig.ident" "nCount_RNA" "nFeature_RNA"
## [4] "cell_barcode" "sampleid" "sex"
## [7] "age" "xbatch" "xversion"
## [10] "seqpool" "uniq_id" "n_genes"
## [13] "n_genes_by_counts" "total_counts" "total_counts_mt"
## [16] "pct_counts_mt" "doublet_scores" "annotation"
head(srt_dev@meta.data)
## orig.ident nCount_RNA
## CGAGAAGCATTCCTAT_12006WApool01-N__70_13103 SeuratProject 35134.77
## TGCATCCGTTATCTTC_12006WApool01-N__70_13103 SeuratProject 32529.36
## CATACCCAGCACACAG_12006WApool01-N__70_13103 SeuratProject 34500.72
## GGTTAACGTCATCGGC_12006WApool01-N__70_13103 SeuratProject 34699.72
## AGGCTGCAGGTGCTTT_12006WApool01-N__70_13103 SeuratProject 31289.23
## TGCCGAGTCCCAGGAC_12006WApool01-N__70_13103 SeuratProject 31620.95
## nFeature_RNA
## CGAGAAGCATTCCTAT_12006WApool01-N__70_13103 7917
## TGCATCCGTTATCTTC_12006WApool01-N__70_13103 7470
## CATACCCAGCACACAG_12006WApool01-N__70_13103 7960
## GGTTAACGTCATCGGC_12006WApool01-N__70_13103 7882
## AGGCTGCAGGTGCTTT_12006WApool01-N__70_13103 7629
## TGCCGAGTCCCAGGAC_12006WApool01-N__70_13103 7739
## cell_barcode
## CGAGAAGCATTCCTAT_12006WApool01-N__70_13103 CGAGAAGCATTCCTAT_12006WApool01-N__70_13103
## TGCATCCGTTATCTTC_12006WApool01-N__70_13103 TGCATCCGTTATCTTC_12006WApool01-N__70_13103
## CATACCCAGCACACAG_12006WApool01-N__70_13103 CATACCCAGCACACAG_12006WApool01-N__70_13103
## GGTTAACGTCATCGGC_12006WApool01-N__70_13103 GGTTAACGTCATCGGC_12006WApool01-N__70_13103
## AGGCTGCAGGTGCTTT_12006WApool01-N__70_13103 AGGCTGCAGGTGCTTT_12006WApool01-N__70_13103
## TGCCGAGTCCCAGGAC_12006WApool01-N__70_13103 TGCCGAGTCCCAGGAC_12006WApool01-N__70_13103
## sampleid sex age
## CGAGAAGCATTCCTAT_12006WApool01-N__70_13103 12006WApool01-N__70_13103 M 19pcw
## TGCATCCGTTATCTTC_12006WApool01-N__70_13103 12006WApool01-N__70_13103 M 19pcw
## CATACCCAGCACACAG_12006WApool01-N__70_13103 12006WApool01-N__70_13103 M 19pcw
## GGTTAACGTCATCGGC_12006WApool01-N__70_13103 12006WApool01-N__70_13103 M 19pcw
## AGGCTGCAGGTGCTTT_12006WApool01-N__70_13103 12006WApool01-N__70_13103 M 19pcw
## TGCCGAGTCCCAGGAC_12006WApool01-N__70_13103 12006WApool01-N__70_13103 M 19pcw
## xbatch xversion seqpool uniq_id
## CGAGAAGCATTCCTAT_12006WApool01-N__70_13103 10 3 nan HCA_13103
## TGCATCCGTTATCTTC_12006WApool01-N__70_13103 10 3 nan HCA_13103
## CATACCCAGCACACAG_12006WApool01-N__70_13103 10 3 nan HCA_13103
## GGTTAACGTCATCGGC_12006WApool01-N__70_13103 10 3 nan HCA_13103
## AGGCTGCAGGTGCTTT_12006WApool01-N__70_13103 10 3 nan HCA_13103
## TGCCGAGTCCCAGGAC_12006WApool01-N__70_13103 10 3 nan HCA_13103
## n_genes n_genes_by_counts
## CGAGAAGCATTCCTAT_12006WApool01-N__70_13103 7918 7917
## TGCATCCGTTATCTTC_12006WApool01-N__70_13103 7472 7470
## CATACCCAGCACACAG_12006WApool01-N__70_13103 7960 7960
## GGTTAACGTCATCGGC_12006WApool01-N__70_13103 7882 7882
## AGGCTGCAGGTGCTTT_12006WApool01-N__70_13103 7629 7629
## TGCCGAGTCCCAGGAC_12006WApool01-N__70_13103 7739 7739
## total_counts total_counts_mt
## CGAGAAGCATTCCTAT_12006WApool01-N__70_13103 35134.77 46.00
## TGCATCCGTTATCTTC_12006WApool01-N__70_13103 32529.36 51.50
## CATACCCAGCACACAG_12006WApool01-N__70_13103 34500.72 205.96
## GGTTAACGTCATCGGC_12006WApool01-N__70_13103 34699.72 137.00
## AGGCTGCAGGTGCTTT_12006WApool01-N__70_13103 31289.23 141.00
## TGCCGAGTCCCAGGAC_12006WApool01-N__70_13103 31620.95 176.00
## pct_counts_mt doublet_scores
## CGAGAAGCATTCCTAT_12006WApool01-N__70_13103 0.1309244 0.08108108
## TGCATCCGTTATCTTC_12006WApool01-N__70_13103 0.1583185 0.04424779
## CATACCCAGCACACAG_12006WApool01-N__70_13103 0.5969730 0.14285714
## GGTTAACGTCATCGGC_12006WApool01-N__70_13103 0.3948158 0.10638298
## AGGCTGCAGGTGCTTT_12006WApool01-N__70_13103 0.4506343 0.02276708
## TGCCGAGTCCCAGGAC_12006WApool01-N__70_13103 0.5565931 0.04424779
## annotation
## CGAGAAGCATTCCTAT_12006WApool01-N__70_13103 U3
## TGCATCCGTTATCTTC_12006WApool01-N__70_13103 U3
## CATACCCAGCACACAG_12006WApool01-N__70_13103 U3
## GGTTAACGTCATCGGC_12006WApool01-N__70_13103 U3
## AGGCTGCAGGTGCTTT_12006WApool01-N__70_13103 U3
## TGCCGAGTCCCAGGAC_12006WApool01-N__70_13103 U3
DimPlot(srt_dev)
In both objects we need to delete the scaled data, as there are both subsets of bigger datasets. From the adult ones we only keep the BA4 tissue, as it is the most similar to the foetal tissue we’ve got. We also select only the genes in common between both datasets.
Not all the metadata is useful, for example all the different clusterings are not required as they were done with other cells to subset the OPCs.
# Create vector with genes common in both objects
features <- intersect(rownames(srt_old), rownames(srt_dev))
# Subset both objects with only these features
srt_old <- subset(srt_old, features = features)
srt_dev <- subset(srt_dev, features = features)
# Subset metadata of interest from the developmental HCA
keep_metadata_dev <-
c(
"cell_barcode",
"sampleid",
"sex",
"age",
"xbatch",
"xversion",
"seqpool",
"uniq_id",
"n_genes",
"total_counts",
"pct_counts_mt",
"doublet_scores"
)
srt_dev@meta.data <- srt_dev@meta.data[, keep_metadata_dev]
# Indicate that the counts and features are from the previous object with all the
# celltypes
srt_dev$detected_all_celltypes <- srt_dev$n_genes
srt_dev$total_all_celltypes <- srt_dev$total_counts
srt_dev$n_genes <- NULL
srt_dev$total_counts <- NULL
# Add column that indicates that these are embryos
srt_dev$AdultVsEmbryo <- "embryo"
srt_dev$AgeGroup <- "Embryo"
# Delete the data slot (with counts normalized and scaled) to be sure it
# won't be used, as it is not correct for this subset
srt_dev[["RNA"]]@data <- matrix(
data = NA,
nrow = dim(srt_dev)[1],
ncol = dim(srt_dev)[2],
dimnames = list(rownames(srt_dev), colnames(srt_dev))
)
# only keep the BA4 tissue
srt_old <- subset(srt_old, subset = Tissue == "BA4")
# Subset metadata of interest from the HCA old object
keep_metadata_old <-
c("process_number",
"Barcode",
"detected",
"total",
"caseNO",
"Tissue",
"gender",
"Age",
"BBN",
"PMI",
"CauseOfDeath_category",
"X10XBatch",
"SequencingPool",
"total_percent_mito",
"AgeGroup",
"RIN" ,
"ageSex",
"uniq_id",
"scDblFinder_class",
"scDblFinder_score"
)
srt_old@meta.data <- srt_old@meta.data[, keep_metadata_old]
# Indicate that the counts and features are from the previous object with all the
# celltypes
srt_old$detected_all_celltypes <- srt_old$detected
srt_old$total_all_celltypes <- srt_old$total
srt_old$detected <- NULL
srt_old$total <- NULL
# Change some names to match the other object naming
srt_old$sex <- srt_old$gender
srt_old$age <- srt_old$Age
srt_old$cell_barcode <- srt_old$Barcode
srt_old$seqpool <- srt_old$SequencingPool
srt_old$xbatch <- srt_old$X10XBatch
srt_old$pct_counts_mt <- srt_old$total_percent_mito
srt_old$doublet_scores <- srt_old$scDblFinder_score
# Delete the previous name
srt_old$gender <- NULL
srt_old$Age <- NULL
srt_old$Barcode <- NULL
srt_old$SequencingPool <- NULL
srt_old$X10XBatch <- NULL
srt_old$total_percent_mito <- NULL
srt_old$scDblFinder_score <- NULL
# Add column that indicates that these are adults
srt_old$AdultVsEmbryo <- "adult"
# Delete the scaled slot as it is not correct for this subset
srt_old[["RNA"]]@scale.data <- as.matrix(NA)
The integration is performed following Seurat’s vignette, that performs the integration as described in (Sturart2019?).
Some changes include:
Setting the k.filter to NA, as suggested in this comment for small datasets. The default is 200, and eventhough the comment was for datasets with less than 200 cells, I feel it is still relevant for this dataset that just contains 457 cells.
k.anchor = 10. These datasets differ so it might benefit from a more “aggressive” integration. Even if it duplicates Seurat’s default this value is still very reasonable. For example the default in fastMNN, from Bioconductor is 20 (even if it is not exactly the same algorithm I think it is comparable).
# Create list with objects
srt_list <- list(srt_dev, srt_old)
# And remove the previous objects to save space
rm(srt_dev, srt_old)
# normalize and identify variable features for each dataset independently
srt_list<- lapply(X = srt_list, FUN = function(x) {
x <- NormalizeData(x)
x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})
# select features that are repeatedly variable across datasets for integration
var_features <- SelectIntegrationFeatures(object.list = srt_list)
# create anchors to integrate the datasets
anchors <- FindIntegrationAnchors(object.list = srt_list, anchor.features = var_features, k.anchor = 10, k.filter = NA )
# create an 'integrated' data assay combining srt_dev and srt_old
srt_combined <- IntegrateData(anchorset = anchors)
# specify that we will perform downstream analysis on the corrected data, the original unmodified data still resides in the 'RNA' assay
DefaultAssay(srt_combined) <- "integrated"
Idents(srt_combined) <- srt_combined$AdultVsEmbryo
# Run the standard workflow for visualization and clustering
srt_combined <- ScaleData(srt_combined, verbose = FALSE)
srt_combined <- RunPCA(srt_combined, verbose = FALSE)
# Choose the number of PCs
ElbowPlot(srt_combined, ndims = 50)
# 20
srt_combined <- RunUMAP(srt_combined, reduction = "pca", dims = 1:20)
DimPlot(srt_combined)
saveRDS(srt_combined, here("Processed", "srt_hca_embryos.rds"))