Explore these objects

Adults

The data slot contains the normalized data, the counts the raw data and the scale the scaled data. There are 84 entries in the metadata.

dim(srt_old$RNA@data)

## [1] 19280  3391

# Select a subset of the data to display to see what kind of data
# is stored in each slot
srt_old$RNA@data[10:15,50:53]

## 6 x 4 sparse Matrix of class "dgCMatrix"
##          BA4_possorted_genome_bam_A624Z:TCAATCTAGTGATGGCx
## SDF4                                            .        
## C1QTNF12                                        .        
## UBE2J2                                          .        
## ACAP3                                           0.8772232
## INTS11                                          .        
## DVL1                                            .        
##          BA4_possorted_genome_bam_A624Z:TCACTCGCAACTGAAAx
## SDF4                                                    .
## C1QTNF12                                                .
## UBE2J2                                                  .
## ACAP3                                                   .
## INTS11                                                  .
## DVL1                                                    .
##          BA4_possorted_genome_bam_A624Z:TCGACGGCACTCTCGTx
## SDF4                                             .       
## C1QTNF12                                         .       
## UBE2J2                                           .       
## ACAP3                                            1.104579
## INTS11                                           .       
## DVL1                                             .       
##          BA4_possorted_genome_bam_A624Z:TCCATCGGTGTTGACTx
## SDF4                                                    .
## C1QTNF12                                                .
## UBE2J2                                                  .
## ACAP3                                                   .
## INTS11                                                  .
## DVL1                                                    .

srt_old$RNA@counts[10:15,50:53]

## 6 x 4 sparse Matrix of class "dgCMatrix"
##          BA4_possorted_genome_bam_A624Z:TCAATCTAGTGATGGCx
## SDF4                                                    .
## C1QTNF12                                                .
## UBE2J2                                                  .
## ACAP3                                                   1
## INTS11                                                  .
## DVL1                                                    .
##          BA4_possorted_genome_bam_A624Z:TCACTCGCAACTGAAAx
## SDF4                                                    .
## C1QTNF12                                                .
## UBE2J2                                                  .
## ACAP3                                                   .
## INTS11                                                  .
## DVL1                                                    .
##          BA4_possorted_genome_bam_A624Z:TCGACGGCACTCTCGTx
## SDF4                                                    .
## C1QTNF12                                                .
## UBE2J2                                                  .
## ACAP3                                                   1
## INTS11                                                  .
## DVL1                                                    .
##          BA4_possorted_genome_bam_A624Z:TCCATCGGTGTTGACTx
## SDF4                                                    .
## C1QTNF12                                                .
## UBE2J2                                                  .
## ACAP3                                                   .
## INTS11                                                  .
## DVL1                                                    .

srt_old$RNA@scale.data[10:15,50:53]

##        BA4_possorted_genome_bam_A624Z:TCAATCTAGTGATGGCx
## RUNX3                                        -0.1095590
## IFI6                                         -0.3099379
## LAPTM5                                       -0.2745048
## CSMD2                                        -0.4193383
## GRIK3                                        -0.2666249
## HEYL                                         -0.1188837
##        BA4_possorted_genome_bam_A624Z:TCACTCGCAACTGAAAx
## RUNX3                                        -0.1095590
## IFI6                                         -0.3099379
## LAPTM5                                       -0.2745048
## CSMD2                                        -0.4193383
## GRIK3                                        -0.2666249
## HEYL                                         -0.1188837
##        BA4_possorted_genome_bam_A624Z:TCGACGGCACTCTCGTx
## RUNX3                                        -0.1095590
## IFI6                                         -0.3099379
## LAPTM5                                       -0.2745048
## CSMD2                                         1.1838427
## GRIK3                                         2.3162946
## HEYL                                         -0.1188837
##        BA4_possorted_genome_bam_A624Z:TCCATCGGTGTTGACTx
## RUNX3                                        -0.1095590
## IFI6                                         -0.3099379
## LAPTM5                                       -0.2745048
## CSMD2                                        -0.4193383
## GRIK3                                        -0.2666249
## HEYL                                         -0.1188837

colnames(srt_old@meta.data)

##  [1] "orig.ident"                "nCount_RNA"               
##  [3] "nFeature_RNA"              "process_number"           
##  [5] "Barcode"                   "caseNO"                   
##  [7] "Tissue"                    "gender"                   
##  [9] "Age"                       "BBN"                      
## [11] "PMI"                       "CauseOfDeath_category"    
## [13] "CauseOfDeath_1a"           "CauseOfDeath_1b"          
## [15] "CauseOfDeath_1c"           "CauseOfDeath_1d"          
## [17] "Cryostat_date"             "SlideBox"                 
## [19] "Location_Block"            "comments"                 
## [21] "RNAconcRIN"                "RINvalue"                 
## [23] "RINvalueDate"              "RINvalueRemeasured"       
## [25] "rEeXTRACT"                 "X10XBatch"                
## [27] "SequencingPool"            "project_name"             
## [29] "uniq_id"                   "percent.mt"               
## [31] "ident"                     "sum"                      
## [33] "detected"                  "total"                    
## [35] "low_lib_size"              "large_lib_size"           
## [37] "low_n_features"            "high_n_features"          
## [39] "high_subsets_mito_percent" "discard"                  
## [41] "ProcessNumber"             "ScaterQC_failed"          
## [43] "sizeFactor"                "scDblFinder_weighted"     
## [45] "scDblFinder_ratio"         "scDblFinder_class"        
## [47] "scDblFinder_score"         "total_percent_mito"       
## [49] "percent_ribo"              "RNA_snn_res.0.5"          
## [51] "seurat_clusters"           "RNA_snn_res.0.8"          
## [53] "CountsPerCluster_res_0_8"  "IdCountGroup"             
## [55] "samplesPerCluster_res_0_8" "sample_count_per_cl_group"
## [57] "AgeGroup"                  "RIN"                      
## [59] "ageSex"                    "S.Score"                  
## [61] "G2M.Score"                 "Phase"                    
## [63] "old.ident"                 "RNA_snn_res.0.1"          
## [65] "rough_annot"               "sample_qc_failed"         
## [67] "QC_UMI"                    "caseNO_tissue"            
## [69] "clusters_0.8"              "clusters_0.5"             
## [71] "RNA_snn_res.0.9"           "clusters_0.9"             
## [73] "RNA_snn_res.1"             "clusters_1"               
## [75] "RNA_snn_res.1.1"           "clusters_1.1"             
## [77] "RNA_snn_res.1.2"           "clusters_1.2"             
## [79] "RNA_snn_res.1.3"           "clusters_1.3"             
## [81] "RNA_snn_res.1.4"           "clusters_1.4"             
## [83] "RNA_snn_res.1.5"           "clusters_named"

DimPlot(srt_old)

Foetal

The data slot contains the scaled data and the counts the raw data. There is no normalized non-scaled data. The metadata contains 17 entries

dim(srt_dev$RNA@data)

## [1] 26428   457

dim(srt_dev$RNA@counts)

## [1] 26428   457

dim(srt_dev$RNA@scale.data)

## [1] 0 0

# Select a subset of the data to display to see what kind of data
# is stored in each slot
srt_dev$RNA@counts[50:55,1:3]

## 6 x 3 sparse Matrix of class "dgCMatrix"
##          CGAGAAGCATTCCTAT_12006WApool01-N__70_13103
## TNFRSF18                                          .
## TNFRSF4                                           .
## SDF4                                              .
## B3GALT6                                           1
## C1QTNF12                                          .
## UBE2J2                                            .
##          TGCATCCGTTATCTTC_12006WApool01-N__70_13103
## TNFRSF18                                          .
## TNFRSF4                                           .
## SDF4                                              1
## B3GALT6                                           .
## C1QTNF12                                          .
## UBE2J2                                            2
##          CATACCCAGCACACAG_12006WApool01-N__70_13103
## TNFRSF18                                          .
## TNFRSF4                                           .
## SDF4                                              4
## B3GALT6                                           3
## C1QTNF12                                          .
## UBE2J2                                            4

srt_dev$RNA@data[50:55,1:3]

## 6 x 3 sparse Matrix of class "dgCMatrix"
##          CGAGAAGCATTCCTAT_12006WApool01-N__70_13103
## TNFRSF18                                  .        
## TNFRSF4                                   .        
## SDF4                                      .        
## B3GALT6                                   0.2504616
## C1QTNF12                                  .        
## UBE2J2                                    .        
##          TGCATCCGTTATCTTC_12006WApool01-N__70_13103
## TNFRSF18                                  .        
## TNFRSF4                                   .        
## SDF4                                      0.2680517
## B3GALT6                                   .        
## C1QTNF12                                  .        
## UBE2J2                                    0.4792293
##          CATACCCAGCACACAG_12006WApool01-N__70_13103
## TNFRSF18                                  .        
## TNFRSF4                                   .        
## SDF4                                      0.7698283
## B3GALT6                                   0.6256959
## C1QTNF12                                  .        
## UBE2J2                                    0.7698283

colnames(srt_dev@meta.data)

##  [1] "orig.ident"        "nCount_RNA"        "nFeature_RNA"     
##  [4] "cell_barcode"      "sampleid"          "sex"              
##  [7] "age"               "xbatch"            "xversion"         
## [10] "seqpool"           "uniq_id"           "n_genes"          
## [13] "n_genes_by_counts" "total_counts"      "total_counts_mt"  
## [16] "pct_counts_mt"     "doublet_scores"    "annotation"

head(srt_dev@meta.data)

##                                               orig.ident nCount_RNA
## CGAGAAGCATTCCTAT_12006WApool01-N__70_13103 SeuratProject   35134.77
## TGCATCCGTTATCTTC_12006WApool01-N__70_13103 SeuratProject   32529.36
## CATACCCAGCACACAG_12006WApool01-N__70_13103 SeuratProject   34500.72
## GGTTAACGTCATCGGC_12006WApool01-N__70_13103 SeuratProject   34699.72
## AGGCTGCAGGTGCTTT_12006WApool01-N__70_13103 SeuratProject   31289.23
## TGCCGAGTCCCAGGAC_12006WApool01-N__70_13103 SeuratProject   31620.95
##                                            nFeature_RNA
## CGAGAAGCATTCCTAT_12006WApool01-N__70_13103         7917
## TGCATCCGTTATCTTC_12006WApool01-N__70_13103         7470
## CATACCCAGCACACAG_12006WApool01-N__70_13103         7960
## GGTTAACGTCATCGGC_12006WApool01-N__70_13103         7882
## AGGCTGCAGGTGCTTT_12006WApool01-N__70_13103         7629
## TGCCGAGTCCCAGGAC_12006WApool01-N__70_13103         7739
##                                                                          cell_barcode
## CGAGAAGCATTCCTAT_12006WApool01-N__70_13103 CGAGAAGCATTCCTAT_12006WApool01-N__70_13103
## TGCATCCGTTATCTTC_12006WApool01-N__70_13103 TGCATCCGTTATCTTC_12006WApool01-N__70_13103
## CATACCCAGCACACAG_12006WApool01-N__70_13103 CATACCCAGCACACAG_12006WApool01-N__70_13103
## GGTTAACGTCATCGGC_12006WApool01-N__70_13103 GGTTAACGTCATCGGC_12006WApool01-N__70_13103
## AGGCTGCAGGTGCTTT_12006WApool01-N__70_13103 AGGCTGCAGGTGCTTT_12006WApool01-N__70_13103
## TGCCGAGTCCCAGGAC_12006WApool01-N__70_13103 TGCCGAGTCCCAGGAC_12006WApool01-N__70_13103
##                                                             sampleid sex   age
## CGAGAAGCATTCCTAT_12006WApool01-N__70_13103 12006WApool01-N__70_13103   M 19pcw
## TGCATCCGTTATCTTC_12006WApool01-N__70_13103 12006WApool01-N__70_13103   M 19pcw
## CATACCCAGCACACAG_12006WApool01-N__70_13103 12006WApool01-N__70_13103   M 19pcw
## GGTTAACGTCATCGGC_12006WApool01-N__70_13103 12006WApool01-N__70_13103   M 19pcw
## AGGCTGCAGGTGCTTT_12006WApool01-N__70_13103 12006WApool01-N__70_13103   M 19pcw
## TGCCGAGTCCCAGGAC_12006WApool01-N__70_13103 12006WApool01-N__70_13103   M 19pcw
##                                            xbatch xversion seqpool   uniq_id
## CGAGAAGCATTCCTAT_12006WApool01-N__70_13103     10        3     nan HCA_13103
## TGCATCCGTTATCTTC_12006WApool01-N__70_13103     10        3     nan HCA_13103
## CATACCCAGCACACAG_12006WApool01-N__70_13103     10        3     nan HCA_13103
## GGTTAACGTCATCGGC_12006WApool01-N__70_13103     10        3     nan HCA_13103
## AGGCTGCAGGTGCTTT_12006WApool01-N__70_13103     10        3     nan HCA_13103
## TGCCGAGTCCCAGGAC_12006WApool01-N__70_13103     10        3     nan HCA_13103
##                                            n_genes n_genes_by_counts
## CGAGAAGCATTCCTAT_12006WApool01-N__70_13103    7918              7917
## TGCATCCGTTATCTTC_12006WApool01-N__70_13103    7472              7470
## CATACCCAGCACACAG_12006WApool01-N__70_13103    7960              7960
## GGTTAACGTCATCGGC_12006WApool01-N__70_13103    7882              7882
## AGGCTGCAGGTGCTTT_12006WApool01-N__70_13103    7629              7629
## TGCCGAGTCCCAGGAC_12006WApool01-N__70_13103    7739              7739
##                                            total_counts total_counts_mt
## CGAGAAGCATTCCTAT_12006WApool01-N__70_13103     35134.77           46.00
## TGCATCCGTTATCTTC_12006WApool01-N__70_13103     32529.36           51.50
## CATACCCAGCACACAG_12006WApool01-N__70_13103     34500.72          205.96
## GGTTAACGTCATCGGC_12006WApool01-N__70_13103     34699.72          137.00
## AGGCTGCAGGTGCTTT_12006WApool01-N__70_13103     31289.23          141.00
## TGCCGAGTCCCAGGAC_12006WApool01-N__70_13103     31620.95          176.00
##                                            pct_counts_mt doublet_scores
## CGAGAAGCATTCCTAT_12006WApool01-N__70_13103     0.1309244     0.08108108
## TGCATCCGTTATCTTC_12006WApool01-N__70_13103     0.1583185     0.04424779
## CATACCCAGCACACAG_12006WApool01-N__70_13103     0.5969730     0.14285714
## GGTTAACGTCATCGGC_12006WApool01-N__70_13103     0.3948158     0.10638298
## AGGCTGCAGGTGCTTT_12006WApool01-N__70_13103     0.4506343     0.02276708
## TGCCGAGTCCCAGGAC_12006WApool01-N__70_13103     0.5565931     0.04424779
##                                            annotation
## CGAGAAGCATTCCTAT_12006WApool01-N__70_13103         U3
## TGCATCCGTTATCTTC_12006WApool01-N__70_13103         U3
## CATACCCAGCACACAG_12006WApool01-N__70_13103         U3
## GGTTAACGTCATCGGC_12006WApool01-N__70_13103         U3
## AGGCTGCAGGTGCTTT_12006WApool01-N__70_13103         U3
## TGCCGAGTCCCAGGAC_12006WApool01-N__70_13103         U3

DimPlot(srt_dev)

Clean up the objects

In both objects we need to delete the scaled data, as there are both subsets of bigger datasets. From the adult ones we only keep the BA4 tissue, as it is the most similar to the foetal tissue we’ve got. We also select only the genes in common between both datasets.

Not all the metadata is useful, for example all the different clusterings are not required as they were done with other cells to subset the OPCs.

# Create vector with genes common in both objects
features <- intersect(rownames(srt_old), rownames(srt_dev))
# Subset both objects with only these features
srt_old <- subset(srt_old, features = features)
srt_dev <- subset(srt_dev, features = features)
# Subset metadata of interest from the developmental HCA
keep_metadata_dev <-
  c(
    "cell_barcode",
    "sampleid",
    "sex",
    "age",
    "xbatch",
    "xversion",
    "seqpool",
    "uniq_id",
    "n_genes",
    "total_counts",
    "pct_counts_mt",
    "doublet_scores"
  )
srt_dev@meta.data <- srt_dev@meta.data[, keep_metadata_dev]
# Indicate that the counts and features are from the previous object with all the
# celltypes
srt_dev$detected_all_celltypes <- srt_dev$n_genes
srt_dev$total_all_celltypes <- srt_dev$total_counts
srt_dev$n_genes <- NULL
srt_dev$total_counts <- NULL
# Add column that indicates that these are embryos
srt_dev$AdultVsEmbryo <- "embryo"
srt_dev$AgeGroup <- "Embryo"
# Delete the data slot (with counts normalized and scaled) to be sure it
# won't be used, as it is not correct for this subset
srt_dev[["RNA"]]@data <- matrix(
  data = NA,
  nrow = dim(srt_dev)[1],
  ncol = dim(srt_dev)[2],
  dimnames = list(rownames(srt_dev), colnames(srt_dev))
)
# only keep the BA4 tissue
srt_old <- subset(srt_old, subset = Tissue == "BA4")
# Subset metadata of interest from the HCA old object
keep_metadata_old <-
  c("process_number",
    "Barcode",
    "detected",
    "total",
    "caseNO",
    "Tissue",
    "gender",
    "Age",
    "BBN",
    "PMI",
    "CauseOfDeath_category",
    "X10XBatch",
    "SequencingPool",
    "total_percent_mito",
    "AgeGroup",
    "RIN" ,
    "ageSex",
    "uniq_id",
    "scDblFinder_class",
    "scDblFinder_score"
  )
srt_old@meta.data <- srt_old@meta.data[, keep_metadata_old]
# Indicate that the counts and features are from the previous object with all the
# celltypes
srt_old$detected_all_celltypes <- srt_old$detected
srt_old$total_all_celltypes <- srt_old$total
srt_old$detected <- NULL
srt_old$total <- NULL
# Change some names to match the other object naming
srt_old$sex <- srt_old$gender
srt_old$age <- srt_old$Age
srt_old$cell_barcode <- srt_old$Barcode
srt_old$seqpool <- srt_old$SequencingPool
srt_old$xbatch <- srt_old$X10XBatch
srt_old$pct_counts_mt <- srt_old$total_percent_mito
srt_old$doublet_scores <- srt_old$scDblFinder_score
# Delete the previous name
srt_old$gender <- NULL
srt_old$Age <- NULL
srt_old$Barcode <- NULL
srt_old$SequencingPool <- NULL
srt_old$X10XBatch <- NULL
srt_old$total_percent_mito <- NULL
srt_old$scDblFinder_score <- NULL
# Add column that indicates that these are adults
srt_old$AdultVsEmbryo <- "adult"
# Delete the scaled slot as it is not correct for this subset
srt_old[["RNA"]]@scale.data <- as.matrix(NA)

Integrate the two datasets

The integration is performed following Seurat’s vignette, that performs the integration as described in (Sturart2019?).

Some changes include:

Setting the k.filter to NA, as suggested in this comment for small datasets. The default is 200, and eventhough the comment was for datasets with less than 200 cells, I feel it is still relevant for this dataset that just contains 457 cells.
k.anchor = 10. These datasets differ so it might benefit from a more “aggressive” integration. Even if it duplicates Seurat’s default this value is still very reasonable. For example the default in fastMNN, from Bioconductor is 20 (even if it is not exactly the same algorithm I think it is comparable).

# Create list with objects
srt_list <- list(srt_dev, srt_old)
# And remove the previous objects to save space
rm(srt_dev, srt_old)
# normalize and identify variable features for each dataset independently
srt_list<- lapply(X = srt_list, FUN = function(x) {
    x <- NormalizeData(x)
    x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})
# select features that are repeatedly variable across datasets for integration
var_features <- SelectIntegrationFeatures(object.list = srt_list)
# create anchors to integrate the datasets
anchors <- FindIntegrationAnchors(object.list = srt_list, anchor.features = var_features, k.anchor = 10, k.filter = NA )
# create an 'integrated' data assay combining srt_dev and srt_old
srt_combined <- IntegrateData(anchorset = anchors)

Perform an integrated analysis

# specify that we will perform downstream analysis on the corrected data, the original unmodified data still resides in the 'RNA' assay
DefaultAssay(srt_combined) <- "integrated"
Idents(srt_combined) <- srt_combined$AdultVsEmbryo
# Run the standard workflow for visualization and clustering
srt_combined <- ScaleData(srt_combined, verbose = FALSE)
srt_combined <- RunPCA(srt_combined, verbose = FALSE)
# Choose the number of PCs
ElbowPlot(srt_combined, ndims = 50)

# 20
srt_combined <- RunUMAP(srt_combined, reduction = "pca", dims = 1:20)
DimPlot(srt_combined)

saveRDS(srt_combined, here("Processed", "srt_hca_embryos.rds"))

Compare adult and Embryo OPCs

Nadine Bestard

23/02/2021