Setup

# libraries
library(Seurat) # for scrnaseq analyisis
## Attaching SeuratObject
library(here)   # for reproducible paths
## here() starts at C:/Users/nbestard/OneDrive/Laura

Load objects

srt_old <- readRDS(here("Data", "HCA", "srt_opcs.RDS"))
srt_dev <- readRDS(here("Data", "Embryos", "hca_dev_scanpy_harmony_opcs.rds"))

The seurat adult object contain the sum of spliced and unspliced in the counts slot, and it has been processed with scater QC, scran normalisation, feature selection, scaling, dimensional reduction, annotation.

In the Embryonic dataset the counts slot also contain the summed spliced and unspliced matrices. Sometimes alevin algorithm gives non-integers for counts! The data slot has the normalized and scaled data from scanpy. It includes a pca reductions from harmony where an integration by donor was done.

Explore these objects

Adults

The data slot contains the normalized data, the counts the raw data and the scale the scaled data. There are 84 entries in the metadata.

dim(srt_old$RNA@data)
## [1] 19280  3391
# Select a subset of the data to display to see what kind of data
# is stored in each slot
srt_old$RNA@data[10:15,50:53]
## 6 x 4 sparse Matrix of class "dgCMatrix"
##          BA4_possorted_genome_bam_A624Z:TCAATCTAGTGATGGCx
## SDF4                                            .        
## C1QTNF12                                        .        
## UBE2J2                                          .        
## ACAP3                                           0.8772232
## INTS11                                          .        
## DVL1                                            .        
##          BA4_possorted_genome_bam_A624Z:TCACTCGCAACTGAAAx
## SDF4                                                    .
## C1QTNF12                                                .
## UBE2J2                                                  .
## ACAP3                                                   .
## INTS11                                                  .
## DVL1                                                    .
##          BA4_possorted_genome_bam_A624Z:TCGACGGCACTCTCGTx
## SDF4                                             .       
## C1QTNF12                                         .       
## UBE2J2                                           .       
## ACAP3                                            1.104579
## INTS11                                           .       
## DVL1                                             .       
##          BA4_possorted_genome_bam_A624Z:TCCATCGGTGTTGACTx
## SDF4                                                    .
## C1QTNF12                                                .
## UBE2J2                                                  .
## ACAP3                                                   .
## INTS11                                                  .
## DVL1                                                    .
srt_old$RNA@counts[10:15,50:53]
## 6 x 4 sparse Matrix of class "dgCMatrix"
##          BA4_possorted_genome_bam_A624Z:TCAATCTAGTGATGGCx
## SDF4                                                    .
## C1QTNF12                                                .
## UBE2J2                                                  .
## ACAP3                                                   1
## INTS11                                                  .
## DVL1                                                    .
##          BA4_possorted_genome_bam_A624Z:TCACTCGCAACTGAAAx
## SDF4                                                    .
## C1QTNF12                                                .
## UBE2J2                                                  .
## ACAP3                                                   .
## INTS11                                                  .
## DVL1                                                    .
##          BA4_possorted_genome_bam_A624Z:TCGACGGCACTCTCGTx
## SDF4                                                    .
## C1QTNF12                                                .
## UBE2J2                                                  .
## ACAP3                                                   1
## INTS11                                                  .
## DVL1                                                    .
##          BA4_possorted_genome_bam_A624Z:TCCATCGGTGTTGACTx
## SDF4                                                    .
## C1QTNF12                                                .
## UBE2J2                                                  .
## ACAP3                                                   .
## INTS11                                                  .
## DVL1                                                    .
srt_old$RNA@scale.data[10:15,50:53]
##        BA4_possorted_genome_bam_A624Z:TCAATCTAGTGATGGCx
## RUNX3                                        -0.1095590
## IFI6                                         -0.3099379
## LAPTM5                                       -0.2745048
## CSMD2                                        -0.4193383
## GRIK3                                        -0.2666249
## HEYL                                         -0.1188837
##        BA4_possorted_genome_bam_A624Z:TCACTCGCAACTGAAAx
## RUNX3                                        -0.1095590
## IFI6                                         -0.3099379
## LAPTM5                                       -0.2745048
## CSMD2                                        -0.4193383
## GRIK3                                        -0.2666249
## HEYL                                         -0.1188837
##        BA4_possorted_genome_bam_A624Z:TCGACGGCACTCTCGTx
## RUNX3                                        -0.1095590
## IFI6                                         -0.3099379
## LAPTM5                                       -0.2745048
## CSMD2                                         1.1838427
## GRIK3                                         2.3162946
## HEYL                                         -0.1188837
##        BA4_possorted_genome_bam_A624Z:TCCATCGGTGTTGACTx
## RUNX3                                        -0.1095590
## IFI6                                         -0.3099379
## LAPTM5                                       -0.2745048
## CSMD2                                        -0.4193383
## GRIK3                                        -0.2666249
## HEYL                                         -0.1188837
colnames(srt_old@meta.data)
##  [1] "orig.ident"                "nCount_RNA"               
##  [3] "nFeature_RNA"              "process_number"           
##  [5] "Barcode"                   "caseNO"                   
##  [7] "Tissue"                    "gender"                   
##  [9] "Age"                       "BBN"                      
## [11] "PMI"                       "CauseOfDeath_category"    
## [13] "CauseOfDeath_1a"           "CauseOfDeath_1b"          
## [15] "CauseOfDeath_1c"           "CauseOfDeath_1d"          
## [17] "Cryostat_date"             "SlideBox"                 
## [19] "Location_Block"            "comments"                 
## [21] "RNAconcRIN"                "RINvalue"                 
## [23] "RINvalueDate"              "RINvalueRemeasured"       
## [25] "rEeXTRACT"                 "X10XBatch"                
## [27] "SequencingPool"            "project_name"             
## [29] "uniq_id"                   "percent.mt"               
## [31] "ident"                     "sum"                      
## [33] "detected"                  "total"                    
## [35] "low_lib_size"              "large_lib_size"           
## [37] "low_n_features"            "high_n_features"          
## [39] "high_subsets_mito_percent" "discard"                  
## [41] "ProcessNumber"             "ScaterQC_failed"          
## [43] "sizeFactor"                "scDblFinder_weighted"     
## [45] "scDblFinder_ratio"         "scDblFinder_class"        
## [47] "scDblFinder_score"         "total_percent_mito"       
## [49] "percent_ribo"              "RNA_snn_res.0.5"          
## [51] "seurat_clusters"           "RNA_snn_res.0.8"          
## [53] "CountsPerCluster_res_0_8"  "IdCountGroup"             
## [55] "samplesPerCluster_res_0_8" "sample_count_per_cl_group"
## [57] "AgeGroup"                  "RIN"                      
## [59] "ageSex"                    "S.Score"                  
## [61] "G2M.Score"                 "Phase"                    
## [63] "old.ident"                 "RNA_snn_res.0.1"          
## [65] "rough_annot"               "sample_qc_failed"         
## [67] "QC_UMI"                    "caseNO_tissue"            
## [69] "clusters_0.8"              "clusters_0.5"             
## [71] "RNA_snn_res.0.9"           "clusters_0.9"             
## [73] "RNA_snn_res.1"             "clusters_1"               
## [75] "RNA_snn_res.1.1"           "clusters_1.1"             
## [77] "RNA_snn_res.1.2"           "clusters_1.2"             
## [79] "RNA_snn_res.1.3"           "clusters_1.3"             
## [81] "RNA_snn_res.1.4"           "clusters_1.4"             
## [83] "RNA_snn_res.1.5"           "clusters_named"
DimPlot(srt_old)

Embryonic

The data slot contains the scaled data and the counts the raw data. There is no normalized non-scaled data. The metadata contains 17 entries

dim(srt_dev$RNA@data)
## [1] 26428   388
dim(srt_dev$RNA@counts)
## [1] 26428   388
dim(srt_dev$RNA@scale.data)
## [1] 0 0
# Select a subset of the data to display to see what kind of data
# is stored in each slot
srt_dev$RNA@counts[50:55,1:3]
## 6 x 3 sparse Matrix of class "dgCMatrix"
##          TTATTGCTCAACTGAC_12006WApool01-N__70_13103
## TNFRSF18                                          .
## TNFRSF4                                           .
## SDF4                                              .
## B3GALT6                                           .
## C1QTNF12                                          .
## UBE2J2                                            1
##          GCTACAAAGGTACTGG_12006WApool01-N__70_13103
## TNFRSF18                                          .
## TNFRSF4                                           .
## SDF4                                              .
## B3GALT6                                           1
## C1QTNF12                                          .
## UBE2J2                                            .
##          TTTCACAGTCTCTCCA_12006WApool01-N__70_13103
## TNFRSF18                                          .
## TNFRSF4                                           .
## SDF4                                              .
## B3GALT6                                           .
## C1QTNF12                                          .
## UBE2J2                                            .
srt_dev$RNA@data[50:55,1:3]
##          TTATTGCTCAACTGAC_12006WApool01-N__70_13103
## TNFRSF18                                -0.02794619
## TNFRSF4                                 -0.01033824
## SDF4                                    -0.24113661
## B3GALT6                                 -0.24471466
## C1QTNF12                                -0.01850932
## UBE2J2                                   1.42258894
##          GCTACAAAGGTACTGG_12006WApool01-N__70_13103
## TNFRSF18                                -0.02794619
## TNFRSF4                                 -0.01033824
## SDF4                                    -0.24113661
## B3GALT6                                  2.89962769
## C1QTNF12                                -0.01850932
## UBE2J2                                  -0.39016941
##          TTTCACAGTCTCTCCA_12006WApool01-N__70_13103
## TNFRSF18                                -0.02794619
## TNFRSF4                                 -0.01033824
## SDF4                                    -0.24113661
## B3GALT6                                 -0.24471466
## C1QTNF12                                -0.01850932
## UBE2J2                                  -0.39016941
colnames(srt_dev@meta.data)
##  [1] "orig.ident"        "nCount_RNA"        "nFeature_RNA"     
##  [4] "cell_barcode"      "sampleid"          "gender"           
##  [7] "age"               "xbatch"            "xversion"         
## [10] "sequecingpool"     "uniq_id"           "n_genes"          
## [13] "n_genes_by_counts" "total_counts"      "total_counts_mt"  
## [16] "pct_counts_mt"     "doublet_scores"
head(srt_dev@meta.data)
##                                                   orig.ident nCount_RNA
## TTATTGCTCAACTGAC_12006WApool01-N__70_13103     SeuratProject  11687.043
## GCTACAAAGGTACTGG_12006WApool01-N__70_13103     SeuratProject  11493.330
## TTTCACAGTCTCTCCA_12006WApool01-N__70_13103     SeuratProject  12014.238
## GGCACGTCAAATGGTA_14771WApool01-N__HCA_71_14285 SeuratProject  21195.432
## TCGGTCTCACAATGTC_14771WApool01-N__HCA_71_14285 SeuratProject  14419.953
## CGGAGAATCTGGCCGA_14771WApool01-N__HCA_71_14285 SeuratProject   8690.293
##                                                nFeature_RNA
## TTATTGCTCAACTGAC_12006WApool01-N__70_13103             4226
## GCTACAAAGGTACTGG_12006WApool01-N__70_13103             4566
## TTTCACAGTCTCTCCA_12006WApool01-N__70_13103             4570
## GGCACGTCAAATGGTA_14771WApool01-N__HCA_71_14285         6145
## TCGGTCTCACAATGTC_14771WApool01-N__HCA_71_14285         5246
## CGGAGAATCTGGCCGA_14771WApool01-N__HCA_71_14285         3859
##                                                                                  cell_barcode
## TTATTGCTCAACTGAC_12006WApool01-N__70_13103         TTATTGCTCAACTGAC_12006WApool01-N__70_13103
## GCTACAAAGGTACTGG_12006WApool01-N__70_13103         GCTACAAAGGTACTGG_12006WApool01-N__70_13103
## TTTCACAGTCTCTCCA_12006WApool01-N__70_13103         TTTCACAGTCTCTCCA_12006WApool01-N__70_13103
## GGCACGTCAAATGGTA_14771WApool01-N__HCA_71_14285 GGCACGTCAAATGGTA_14771WApool01-N__HCA_71_14285
## TCGGTCTCACAATGTC_14771WApool01-N__HCA_71_14285 TCGGTCTCACAATGTC_14771WApool01-N__HCA_71_14285
## CGGAGAATCTGGCCGA_14771WApool01-N__HCA_71_14285 CGGAGAATCTGGCCGA_14771WApool01-N__HCA_71_14285
##                                                                     sampleid
## TTATTGCTCAACTGAC_12006WApool01-N__70_13103         12006WApool01-N__70_13103
## GCTACAAAGGTACTGG_12006WApool01-N__70_13103         12006WApool01-N__70_13103
## TTTCACAGTCTCTCCA_12006WApool01-N__70_13103         12006WApool01-N__70_13103
## GGCACGTCAAATGGTA_14771WApool01-N__HCA_71_14285 14771WApool01-N__HCA_71_14285
## TCGGTCTCACAATGTC_14771WApool01-N__HCA_71_14285 14771WApool01-N__HCA_71_14285
## CGGAGAATCTGGCCGA_14771WApool01-N__HCA_71_14285 14771WApool01-N__HCA_71_14285
##                                                gender   age xbatch xversion
## TTATTGCTCAACTGAC_12006WApool01-N__70_13103          M 19pcw     10        3
## GCTACAAAGGTACTGG_12006WApool01-N__70_13103          M 19pcw     10        3
## TTTCACAGTCTCTCCA_12006WApool01-N__70_13103          M 19pcw     10        3
## GGCACGTCAAATGGTA_14771WApool01-N__HCA_71_14285      F 20pcw     12        3
## TCGGTCTCACAATGTC_14771WApool01-N__HCA_71_14285      F 20pcw     12        3
## CGGAGAATCTGGCCGA_14771WApool01-N__HCA_71_14285      F 20pcw     12        3
##                                                sequecingpool   uniq_id n_genes
## TTATTGCTCAACTGAC_12006WApool01-N__70_13103               nan HCA_13103    4226
## GCTACAAAGGTACTGG_12006WApool01-N__70_13103               nan HCA_13103    4566
## TTTCACAGTCTCTCCA_12006WApool01-N__70_13103               nan HCA_13103    4571
## GGCACGTCAAATGGTA_14771WApool01-N__HCA_71_14285           nan HCA_14285    6145
## TCGGTCTCACAATGTC_14771WApool01-N__HCA_71_14285           nan HCA_14285    5248
## CGGAGAATCTGGCCGA_14771WApool01-N__HCA_71_14285           nan HCA_14285    3860
##                                                n_genes_by_counts total_counts
## TTATTGCTCAACTGAC_12006WApool01-N__70_13103                  4226    11687.043
## GCTACAAAGGTACTGG_12006WApool01-N__70_13103                  4566    11493.330
## TTTCACAGTCTCTCCA_12006WApool01-N__70_13103                  4570    12014.238
## GGCACGTCAAATGGTA_14771WApool01-N__HCA_71_14285              6145    21195.432
## TCGGTCTCACAATGTC_14771WApool01-N__HCA_71_14285              5246    14419.953
## CGGAGAATCTGGCCGA_14771WApool01-N__HCA_71_14285              3859     8690.293
##                                                total_counts_mt pct_counts_mt
## TTATTGCTCAACTGAC_12006WApool01-N__70_13103             18.0000     0.1540167
## GCTACAAAGGTACTGG_12006WApool01-N__70_13103            119.0000     1.0353832
## TTTCACAGTCTCTCCA_12006WApool01-N__70_13103             84.0000     0.6991705
## GGCACGTCAAATGGTA_14771WApool01-N__HCA_71_14285        172.4706     0.8137158
## TCGGTCTCACAATGTC_14771WApool01-N__HCA_71_14285         65.0000     0.4507643
## CGGAGAATCTGGCCGA_14771WApool01-N__HCA_71_14285          9.0000     0.1035638
##                                                doublet_scores
## TTATTGCTCAACTGAC_12006WApool01-N__70_13103         0.03379722
## GCTACAAAGGTACTGG_12006WApool01-N__70_13103         0.07428571
## TTTCACAGTCTCTCCA_12006WApool01-N__70_13103         0.20000000
## GGCACGTCAAATGGTA_14771WApool01-N__HCA_71_14285     0.12903226
## TCGGTCTCACAATGTC_14771WApool01-N__HCA_71_14285     0.35714286
## CGGAGAATCTGGCCGA_14771WApool01-N__HCA_71_14285     0.12903226
DimPlot(srt_dev)

Clean up the objects

In both objects we need to delete the scaled data, as there are both subsets of bigger datasets. We also select only the genes in common between both datasets.

Not all the metadata is usefull, for example all the different clusterings are not required as they were done with other cells to subset the OPCs.

# Create vector with genes common in both objects
features <- intersect(rownames(srt_old), rownames(srt_dev))
# Subset both objects with only these features
srt_old <- subset(srt_old, features = features)
srt_dev <- subset(srt_dev, features = features)
# Subset metadata of interest from the developmental HCA
keep_metadata_dev <-
  c(
    "cell_barcode",
    "sampleid",
    "gender",
    "age",
    "xbatch",
    "xversion",
    "sequecingpool",
    "uniq_id",
    "n_genes",
    "total_counts",
    "pct_counts_mt",
    "doublet_scores"
  )
srt_dev@meta.data <- srt_dev@meta.data[, keep_metadata_dev]
# Indicate that the counts and features are from the previous object with all the
# celltypes
srt_dev$detected_all_celltypes <- srt_dev$n_genes
srt_dev$total_all_celltypes <- srt_dev$total_counts
srt_dev$n_genes <- NULL
srt_dev$total_counts <- NULL
# Correct name typo
srt_dev$seq_pool <- srt_dev$sequecingpool
# Add column that indicates that these are embryos
srt_dev$AdultVsEmbryo <- "embryo"
srt_dev$AgeGroup <- "Embryo"
# Delete the data slot (with counts normalized and scaled) to be sure it
# won't be used, as it is not correct for this subset
srt_dev[["RNA"]]@data <- matrix(
  data = NA,
  nrow = dim(srt_dev)[1],
  ncol = dim(srt_dev)[2],
  dimnames = list(rownames(srt_dev), colnames(srt_dev))
)

# Subset metadata of interest from the HCA old object
keep_metadata_old <-
  c(
    "process_number",
    "Barcode",
    "detected",
    "total",
    "caseNO",
    "Tissue",
    "gender",
    "Age",
    "BBN",
    "PMI",
    "CauseOfDeath_category",
    "X10XBatch",
    "SequencingPool",
    "total_percent_mito",
    "AgeGroup",
    "RIN" ,
    "ageSex",
    "uniq_id",
    "scDblFinder_class",
    "scDblFinder_score"
  )
srt_old@meta.data <- srt_old@meta.data[, keep_metadata_old]
# Indicate that the counts and features are from the previous object with all the
# celltypes
srt_old$detected_all_celltypes <- srt_old$detected
srt_old$total_all_celltypes <- srt_old$total
srt_old$detected <- NULL
srt_old$total <- NULL
# Change some names to match the other object naming
srt_old$age <- srt_old$Age
srt_old$cell_barcode <- srt_old$Barcode
srt_old$seq_pool <- srt_old$SequencingPool
srt_old$xbatch <- srt_old$X10XBatch
srt_old$pct_counts_mt <- srt_old$total_percent_mito
srt_old$doublet_scores <- srt_old$scDblFinder_score
# Delete the previous name
srt_old$Age <- NULL
srt_old$Barcode <- NULL
srt_old$SequencingPool <- NULL
srt_old$X10XBatch <- NULL
srt_old$total_percent_mito <- NULL
srt_old$scDblFinder_score <- NULL
# Add column that indicates that these are adults
srt_old$AdultVsEmbryo <- "adult"
# Delete the scaled slot as it is not correct for this subset
srt_old[["RNA"]]@scale.data <- as.matrix(NA)

Integrate the two datasets

The integration is performed following Seurat’s vignette, that performs the integration as described in (Sturart2019?).

Some changes include:

  • Setting the k.filter to NA, as suggested in this comment for small datasets. The default is 200, and eventhough the comment was for datasets with less than 200 cells, I feel it is still relevant for this dataset that just contains 388 features.

  • k.anchor = 10. These datasets differ so it might benefit from a more “aggressive” integration. Even if it duplicates Seurat’s default this value is still very reasonable. For example the default in fastMNN, from Bioconductor is 20 (even if it is not exactly the same algorithm I think it is comparable).

# Create list with objects
srt_list <- list(srt_dev, srt_old)
# And remove the previous objects to save space
rm(srt_dev, srt_old)
# normalize and identify variable features for each dataset independently
srt_list<- lapply(X = srt_list, FUN = function(x) {
    x <- NormalizeData(x)
    x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})
# select features that are repeatedly variable across datasets for integration
var_features <- SelectIntegrationFeatures(object.list = srt_list)
# create anchors to integrate the datasets
anchors <- FindIntegrationAnchors(object.list = srt_list, anchor.features = var_features, k.anchor = 10, k.filter = NA )
# create an 'integrated' data assay combining srt_dev and srt_old
srt_combined <- IntegrateData(anchorset = anchors)

Perform an integrated analysis

# specify that we will perform downstream analysis on the corrected data, the original unmodified data still resides in the 'RNA' assay
DefaultAssay(srt_combined) <- "integrated"
Idents(srt_combined) <- srt_combined$AdultVsEmbryo
# Run the standard workflow for visualization and clustering
srt_combined <- ScaleData(srt_combined, verbose = FALSE)
srt_combined <- RunPCA(srt_combined, verbose = FALSE)
# Choose the number of PCs
ElbowPlot(srt_combined, ndims = 50)

# 20
srt_combined <- RunUMAP(srt_combined, reduction = "pca", dims = 1:20)
DimPlot(srt_combined)

#srt_combined <- FindNeighbors(srt_combined, reduction = "pca", dims = 1:20)
#srt_combined <- FindClusters(srt_combined, resolution = 0.5)