sobj <- readRDS("/Users/ianshelton/Downloads/cellar-master/data/HTB_1261_v2.rds")
# Assays are stored as a list in the "assays" slot
sobj@assays
#> $RNA
#> Assay data with 33808 features for 6763 cells
#> First 10 features:
#> MIR1302-2HG, FAM138A, OR4F5, AL627309.1, AL627309.3, AL627309.2,
#> AL627309.4, AL732372.1, OR4F29, AC114498.1
#>
#> $ADT
#> Assay data with 19 features for 6763 cells
#> First 10 features:
#> CD4.1, CD7.1, CD56, CD3, CD33.1, CD34.1, CD90, CD117, CD45RA, CD123
# The Seurat object stores the current default assay
sobj@active.assay
#> [1] "RNA"
DefaultAssay(sobj)
#> [1] "RNA"
The data from each assay is stored as separate Assay objects that are also divided into slots that store the raw and normalized counts along with other downstream results. We can use the GetAssayData function to retrieve the raw and normalized counts matrices.
# Raw counts
sobj@assays$RNA@counts[1:5, 1:10]
#> 5 x 10 sparse Matrix of class "dgCMatrix"
#>
#> MIR1302-2HG . . . . . . . . . .
#> FAM138A . . . . . . . . . .
#> OR4F5 . . . . . . . . . .
#> AL627309.1 . . . . . . . . . .
#> AL627309.3 . . . . . . . . . .
sobj %>%
GetAssayData(slot = "counts") %>%
.[1:5, 1:10]
#> 5 x 10 sparse Matrix of class "dgCMatrix"
#>
#> MIR1302-2HG . . . . . . . . . .
#> FAM138A . . . . . . . . . .
#> OR4F5 . . . . . . . . . .
#> AL627309.1 . . . . . . . . . .
#> AL627309.3 . . . . . . . . . .
# Normalized counts
# This matrix is the same as counts since we haven't normalized the data yet
sobj@assays$RNA@data[1:5, 1:10]
#> 5 x 10 sparse Matrix of class "dgCMatrix"
#>
#> MIR1302-2HG . . . . . . . . . .
#> FAM138A . . . . . . . . . .
#> OR4F5 . . . . . . . . . .
#> AL627309.1 . . . . . . . . . .
#> AL627309.3 . . . . . . . . . .
sobj %>%
GetAssayData(slot = "data") %>%
.[1:5, 1:10]
#> 5 x 10 sparse Matrix of class "dgCMatrix"
#>
#> MIR1302-2HG . . . . . . . . . .
#> FAM138A . . . . . . . . . .
#> OR4F5 . . . . . . . . . .
#> AL627309.1 . . . . . . . . . .
#> AL627309.3 . . . . . . . . . .
Cell barcodes and gene names can be retrieved from the Seurat object using the colnames and rownames functions, respectively. We can run these functions directly on the Seurat object to automatically retrieve information for the default assay.
# Get cell names
sobj@assays$RNA@counts %>%
colnames() %>%
head(20)
#> [1] "DX_AAACCCAAGCGTATAA" "DX_AAACCCACATCATCTT" "DX_AAACCCAGTAGAGCTG"
#> [4] "DX_AAACGAAAGAGTGACC" "DX_AAACGAATCCTACACC" "DX_AAACGAATCGTAGTCA"
#> [7] "DX_AAACGAATCTGTTGGA" "DX_AAACGCTGTAACAGGC" "DX_AAACGCTTCGTGGTAT"
#> [10] "DX_AAAGAACAGAAGGTAG" "DX_AAAGAACAGCGTATGG" "DX_AAAGAACCATGCGTGC"
#> [13] "DX_AAAGAACGTACCTAGT" "DX_AAAGAACTCCTAGCCT" "DX_AAAGGATTCCTAGCCT"
#> [16] "DX_AAAGGATTCGATTGGT" "DX_AAAGGATTCGTTAGAC" "DX_AAAGGGCAGCAGCCTC"
#> [19] "DX_AAAGGGCAGGTTATAG" "DX_AAAGGGCAGTCGCTAT"
sobj %>%
Cells() %>%
head(20)
#> [1] "DX_AAACCCAAGCGTATAA" "DX_AAACCCACATCATCTT" "DX_AAACCCAGTAGAGCTG"
#> [4] "DX_AAACGAAAGAGTGACC" "DX_AAACGAATCCTACACC" "DX_AAACGAATCGTAGTCA"
#> [7] "DX_AAACGAATCTGTTGGA" "DX_AAACGCTGTAACAGGC" "DX_AAACGCTTCGTGGTAT"
#> [10] "DX_AAAGAACAGAAGGTAG" "DX_AAAGAACAGCGTATGG" "DX_AAAGAACCATGCGTGC"
#> [13] "DX_AAAGAACGTACCTAGT" "DX_AAAGAACTCCTAGCCT" "DX_AAAGGATTCCTAGCCT"
#> [16] "DX_AAAGGATTCGATTGGT" "DX_AAAGGATTCGTTAGAC" "DX_AAAGGGCAGCAGCCTC"
#> [19] "DX_AAAGGGCAGGTTATAG" "DX_AAAGGGCAGTCGCTAT"
sobj %>%
colnames() %>%
head(20)
#> [1] "DX_AAACCCAAGCGTATAA" "DX_AAACCCACATCATCTT" "DX_AAACCCAGTAGAGCTG"
#> [4] "DX_AAACGAAAGAGTGACC" "DX_AAACGAATCCTACACC" "DX_AAACGAATCGTAGTCA"
#> [7] "DX_AAACGAATCTGTTGGA" "DX_AAACGCTGTAACAGGC" "DX_AAACGCTTCGTGGTAT"
#> [10] "DX_AAAGAACAGAAGGTAG" "DX_AAAGAACAGCGTATGG" "DX_AAAGAACCATGCGTGC"
#> [13] "DX_AAAGAACGTACCTAGT" "DX_AAAGAACTCCTAGCCT" "DX_AAAGGATTCCTAGCCT"
#> [16] "DX_AAAGGATTCGATTGGT" "DX_AAAGGATTCGTTAGAC" "DX_AAAGGGCAGCAGCCTC"
#> [19] "DX_AAAGGGCAGGTTATAG" "DX_AAAGGGCAGTCGCTAT"
# Get feature names
sobj@assays$RNA@counts %>%
rownames() %>%
head(20)
#> [1] "MIR1302-2HG" "FAM138A" "OR4F5" "AL627309.1" "AL627309.3"
#> [6] "AL627309.2" "AL627309.4" "AL732372.1" "OR4F29" "AC114498.1"
#> [11] "OR4F16" "AL669831.2" "AL669831.5" "FAM87B" "LINC00115"
#> [16] "FAM41C" "AL645608.6" "AL645608.2" "AL645608.4" "LINC02593"
sobj %>%
rownames() %>%
head(20)
#> [1] "MIR1302-2HG" "FAM138A" "OR4F5" "AL627309.1" "AL627309.3"
#> [6] "AL627309.2" "AL627309.4" "AL732372.1" "OR4F29" "AC114498.1"
#> [11] "OR4F16" "AL669831.2" "AL669831.5" "FAM87B" "LINC00115"
#> [16] "FAM41C" "AL645608.6" "AL645608.2" "AL645608.4" "LINC02593"
# Get cell (colums) and feature (rows) counts
ncol(sobj)
#> [1] 6763
nrow(sobj)
#> [1] 33808
The Seurat object includes a data.frame that contains cell meta data for all of the assays present in the Seurat object. This data.frame is stored in the meta.data slot and can be accessed using double brackets ([[]]) or the FetchData function. As we move through our analysis, information will be automatically added to the meta.data. We can also manually add or modify information stored in the meta.data, which is useful when creating custom plots.
Initially the meta.data table will include the number of counts for each cell and the number of genes (or other features) detected for each cell. There is also an orig.ident column which contains the original cell labels (identities). Since we used the names.field argument when creating our Seurat object, this column will include the sample number.
# The meta.data table contains stats for each cell
sobj@meta.data %>%
head()
#> orig.ident nCount_RNA nFeature_RNA nCount_ADT nFeature_ADT
#> DX_AAACCCAAGCGTATAA HTB_1261_DX 5002 1683 4765 19
#> DX_AAACCCACATCATCTT HTB_1261_DX 7356 2353 2031 19
#> DX_AAACCCAGTAGAGCTG HTB_1261_DX 5419 1754 1799 19
#> DX_AAACGAAAGAGTGACC HTB_1261_DX 6052 2068 2715 19
#> DX_AAACGAATCCTACACC HTB_1261_DX 21348 4442 10084 19
#> DX_AAACGAATCGTAGTCA HTB_1261_DX 8756 2799 2734 19
#> percent.mt
#> DX_AAACCCAAGCGTATAA 11.295482
#> DX_AAACCCACATCATCTT 21.574225
#> DX_AAACCCAGTAGAGCTG 24.580181
#> DX_AAACGAAAGAGTGACC 22.769332
#> DX_AAACGAATCCTACACC 9.223346
#> DX_AAACGAATCGTAGTCA 14.721334
sobj[["nCount_RNA"]] %>%
head()
#> nCount_RNA
#> DX_AAACCCAAGCGTATAA 5002
#> DX_AAACCCACATCATCTT 7356
#> DX_AAACCCAGTAGAGCTG 5419
#> DX_AAACGAAAGAGTGACC 6052
#> DX_AAACGAATCCTACACC 21348
#> DX_AAACGAATCGTAGTCA 8756
sobj[["orig.ident"]] %>%
head()
#> orig.ident
#> DX_AAACCCAAGCGTATAA HTB_1261_DX
#> DX_AAACCCACATCATCTT HTB_1261_DX
#> DX_AAACCCAGTAGAGCTG HTB_1261_DX
#> DX_AAACGAAAGAGTGACC HTB_1261_DX
#> DX_AAACGAATCCTACACC HTB_1261_DX
#> DX_AAACGAATCGTAGTCA HTB_1261_DX
sobj %>%
FetchData(vars = "nCount_RNA") %>%
head()
#> nCount_RNA
#> DX_AAACCCAAGCGTATAA 5002
#> DX_AAACCCACATCATCTT 7356
#> DX_AAACCCAGTAGAGCTG 5419
#> DX_AAACGAAAGAGTGACC 6052
#> DX_AAACGAATCCTACACC 21348
#> DX_AAACGAATCGTAGTCA 8756
# FetchData can also be used to retrieve other data from the Seurat object
sobj %>%
FetchData(
vars = "CD8A",
slot = "counts"
) %>%
head()
#> CD8A
#> DX_AAACCCAAGCGTATAA 0
#> DX_AAACCCACATCATCTT 0
#> DX_AAACCCAGTAGAGCTG 0
#> DX_AAACGAAAGAGTGACC 0
#> DX_AAACGAATCCTACACC 0
#> DX_AAACGAATCGTAGTCA 0
The cell identities are used by various functions when grouping and plotting cells. When first creating a Seurat object, the cell identities will be set to the same values present in the orig.ident column in the meta.data table. The current cell identities are stored in the active.ident slot and can be accessed using the Idents function. Cell identities can be renamed using the RenameIdents function. Changing the cell identites will not alter the names stored in the meta.data table or vice versa. To update the meta.data table, we can use the AddMetaData function.
Before # Rename cell identities sobj <- sobj %>% RenameIdents( “1” = “Control” , “2” = “Treated”
# By default the cell identity is set to the project name
sobj@active.ident %>%
head()
#> DX_AAACCCAAGCGTATAA DX_AAACCCACATCATCTT DX_AAACCCAGTAGAGCTG DX_AAACGAAAGAGTGACC
#> HTB_1261_DX HTB_1261_DX HTB_1261_DX HTB_1261_DX
#> DX_AAACGAATCCTACACC DX_AAACGAATCGTAGTCA
#> HTB_1261_DX HTB_1261_DX
#> Levels: HTB_1261_DX HTB_1261_RL
sobj %>%
Idents() %>%
head()
#> DX_AAACCCAAGCGTATAA DX_AAACCCACATCATCTT DX_AAACCCAGTAGAGCTG DX_AAACGAAAGAGTGACC
#> HTB_1261_DX HTB_1261_DX HTB_1261_DX HTB_1261_DX
#> DX_AAACGAATCCTACACC DX_AAACGAATCGTAGTCA
#> HTB_1261_DX HTB_1261_DX
#> Levels: HTB_1261_DX HTB_1261_RL
# Rename cell identities
sobj <- sobj %>%
RenameIdents(
"HTB_1261_DX" = "1" ,
"HTB_1261_RL" = "2"
)
sobj %>%
Idents() %>%
head()
#> DX_AAACCCAAGCGTATAA DX_AAACCCACATCATCTT DX_AAACCCAGTAGAGCTG DX_AAACGAAAGAGTGACC
#> 1 1 1 1
#> DX_AAACGAATCCTACACC DX_AAACGAATCGTAGTCA
#> 1 1
#> Levels: 1 2
# Add cell identities to the meta.data table
sobj <- sobj %>%
AddMetaData(
metadata = Idents(sobj),
col.name = "orig.ident"
)
sobj@meta.data %>%
head()
#> orig.ident nCount_RNA nFeature_RNA nCount_ADT nFeature_ADT
#> DX_AAACCCAAGCGTATAA 1 5002 1683 4765 19
#> DX_AAACCCACATCATCTT 1 7356 2353 2031 19
#> DX_AAACCCAGTAGAGCTG 1 5419 1754 1799 19
#> DX_AAACGAAAGAGTGACC 1 6052 2068 2715 19
#> DX_AAACGAATCCTACACC 1 21348 4442 10084 19
#> DX_AAACGAATCGTAGTCA 1 8756 2799 2734 19
#> percent.mt
#> DX_AAACCCAAGCGTATAA 11.295482
#> DX_AAACCCACATCATCTT 21.574225
#> DX_AAACCCAGTAGAGCTG 24.580181
#> DX_AAACGAAAGAGTGACC 22.769332
#> DX_AAACGAATCCTACACC 9.223346
#> DX_AAACGAATCGTAGTCA 14.721334
The cells and features present in the Seurat object can be filtering using the subset function.
# Subset based on gene expression
sobj %>%
subset(subset = CD8A > 1) %>%
ncol()
#> [1] 72
# Subset based on cell identity
sobj %>%
subset(idents = "1") %>%
ncol()
#> [1] 3077
# Subset with vector of features
sobj %>%
subset(features = c("CD4", "CD8A")) %>%
rownames()
#> [1] "CD4" "CD8A"
# Subset using multiple criteria
sobj %>%
subset(CD8A > 1, idents = "1") %>%
ncol()
#> [1] 44
# Subset based on meta.data columns
sobj %>%
subset(nFeature_RNA > 250)
#> An object of class Seurat
#> 33827 features across 6570 samples within 2 assays
#> Active assay: RNA (33808 features, 0 variable features)
#> 1 other assay present: ADT
sobj@reductions
sobj@graphs
Metrics that are commonly used to assess cell quality include:
A low number of counts, a low number of detected genes, and a high percentage of mitochondrial counts suggests that the cell had a broken membrane and the cytoplasmic mRNA leaked out. Conversely, an abnormally high number of counts and detected genes could indicate the presence of a doublet.
To calculate the percentage of mitochondrial counts for each cell we can use the PercentageFeatureSet function.
# Calculate percentage of mitochondrial reads for each cell
sobj <- sobj %>%
PercentageFeatureSet(
pattern = "^MT-",
col.name = "percent_mito"
)
sobj@meta.data %>%
head()
#> orig.ident nCount_RNA nFeature_RNA nCount_ADT nFeature_ADT
#> DX_AAACCCAAGCGTATAA 1 5002 1683 4765 19
#> DX_AAACCCACATCATCTT 1 7356 2353 2031 19
#> DX_AAACCCAGTAGAGCTG 1 5419 1754 1799 19
#> DX_AAACGAAAGAGTGACC 1 6052 2068 2715 19
#> DX_AAACGAATCCTACACC 1 21348 4442 10084 19
#> DX_AAACGAATCGTAGTCA 1 8756 2799 2734 19
#> percent.mt percent_mito
#> DX_AAACCCAAGCGTATAA 11.295482 11.295482
#> DX_AAACCCACATCATCTT 21.574225 21.574225
#> DX_AAACCCAGTAGAGCTG 24.580181 24.580181
#> DX_AAACGAAAGAGTGACC 22.769332 22.769332
#> DX_AAACGAATCCTACACC 9.223346 9.223346
#> DX_AAACGAATCGTAGTCA 14.721334 14.721334
# Create violin/boxplots comparing nCount, nFeature, and percent_mito for each sample
# sobj@meta.data %>%
# Create a scatter plot comparing nCount and percent_mito
# sobj@meta.data %>%
# Create a scatter plot comparing nCount and nFeature
# sobj@meta.data %>%
To filter cells we can use the subset function.
# Filter cells based on number of genes and percent mito UMIs
sobj <- sobj %>%
subset(
nFeature_RNA > 250 & # Remove cells with <250 detected genes
nFeature_RNA < 2500 & # Remove cells with >2500 detected genes (could be doublets)
percent_mito < 10 # Remove cells with >10% mitochondrial reads
)
# Add filtering cutoffs on original scatter plots
p_1 <- p_1 +
geom_hline(yintercept = 10, linetype = 2)
p_2 <- p_2 +
geom_hline(yintercept = c(250, 2500), linetype = 2)
CombinePlots(
plots = list(p_1, p_2),
nrow = 2
)
# How many cells were removed?
ncol(sobj)
#> [1] 949
dir.create("data", showWarnings = FALSE)
write_rds(sobj, file = "data/filtered_sobj.rds")