Had an issue with this, they were initially control and treated, but it doesn’t seem like this dataset has those variables, instead it is relapse and dx – Rename cell identities sobj <- sobj %>% RenameIdents(“HTB_1261_DX” = “1” ,“HTB_1261_RL” = “2”

Had issue with vector barcodes as well Subset with vector of cell barcodes sobj %>% subset(cells = c(“AAACGCTGACCAGT-1”, “AAATCATGACCACA-1”, “TTTCGAACACCTGA-2”)) %>% colnames()

sobj <- readRDS("/Users/ianshelton/Downloads/cellar-master/data/HTB_1261_v2.rds")

# Assays are stored as a list in the "assays" slot
sobj@assays
#> $RNA
#> Assay data with 33808 features for 6763 cells
#> First 10 features:
#>  MIR1302-2HG, FAM138A, OR4F5, AL627309.1, AL627309.3, AL627309.2,
#> AL627309.4, AL732372.1, OR4F29, AC114498.1 
#> 
#> $ADT
#> Assay data with 19 features for 6763 cells
#> First 10 features:
#>  CD4.1, CD7.1, CD56, CD3, CD33.1, CD34.1, CD90, CD117, CD45RA, CD123
# The Seurat object stores the current default assay
sobj@active.assay
#> [1] "RNA"
DefaultAssay(sobj)
#> [1] "RNA"


Retrieving raw and normalized counts

The data from each assay is stored as separate Assay objects that are also divided into slots that store the raw and normalized counts along with other downstream results. We can use the GetAssayData function to retrieve the raw and normalized counts matrices.

# Raw counts
sobj@assays$RNA@counts[1:5, 1:10]
#> 5 x 10 sparse Matrix of class "dgCMatrix"
#>                                
#> MIR1302-2HG . . . . . . . . . .
#> FAM138A     . . . . . . . . . .
#> OR4F5       . . . . . . . . . .
#> AL627309.1  . . . . . . . . . .
#> AL627309.3  . . . . . . . . . .
sobj %>%
  GetAssayData(slot = "counts") %>%
  .[1:5, 1:10]
#> 5 x 10 sparse Matrix of class "dgCMatrix"
#>                                
#> MIR1302-2HG . . . . . . . . . .
#> FAM138A     . . . . . . . . . .
#> OR4F5       . . . . . . . . . .
#> AL627309.1  . . . . . . . . . .
#> AL627309.3  . . . . . . . . . .
# Normalized counts
# This matrix is the same as counts since we haven't normalized the data yet
sobj@assays$RNA@data[1:5, 1:10]
#> 5 x 10 sparse Matrix of class "dgCMatrix"
#>                                
#> MIR1302-2HG . . . . . . . . . .
#> FAM138A     . . . . . . . . . .
#> OR4F5       . . . . . . . . . .
#> AL627309.1  . . . . . . . . . .
#> AL627309.3  . . . . . . . . . .
sobj %>%
  GetAssayData(slot = "data") %>%
  .[1:5, 1:10]
#> 5 x 10 sparse Matrix of class "dgCMatrix"
#>                                
#> MIR1302-2HG . . . . . . . . . .
#> FAM138A     . . . . . . . . . .
#> OR4F5       . . . . . . . . . .
#> AL627309.1  . . . . . . . . . .
#> AL627309.3  . . . . . . . . . .


Retrieving cell and features names

Cell barcodes and gene names can be retrieved from the Seurat object using the colnames and rownames functions, respectively. We can run these functions directly on the Seurat object to automatically retrieve information for the default assay.

# Get cell names
sobj@assays$RNA@counts %>%
  colnames() %>%
  head(20)
#>  [1] "DX_AAACCCAAGCGTATAA" "DX_AAACCCACATCATCTT" "DX_AAACCCAGTAGAGCTG"
#>  [4] "DX_AAACGAAAGAGTGACC" "DX_AAACGAATCCTACACC" "DX_AAACGAATCGTAGTCA"
#>  [7] "DX_AAACGAATCTGTTGGA" "DX_AAACGCTGTAACAGGC" "DX_AAACGCTTCGTGGTAT"
#> [10] "DX_AAAGAACAGAAGGTAG" "DX_AAAGAACAGCGTATGG" "DX_AAAGAACCATGCGTGC"
#> [13] "DX_AAAGAACGTACCTAGT" "DX_AAAGAACTCCTAGCCT" "DX_AAAGGATTCCTAGCCT"
#> [16] "DX_AAAGGATTCGATTGGT" "DX_AAAGGATTCGTTAGAC" "DX_AAAGGGCAGCAGCCTC"
#> [19] "DX_AAAGGGCAGGTTATAG" "DX_AAAGGGCAGTCGCTAT"
sobj %>%
  Cells() %>%
  head(20)
#>  [1] "DX_AAACCCAAGCGTATAA" "DX_AAACCCACATCATCTT" "DX_AAACCCAGTAGAGCTG"
#>  [4] "DX_AAACGAAAGAGTGACC" "DX_AAACGAATCCTACACC" "DX_AAACGAATCGTAGTCA"
#>  [7] "DX_AAACGAATCTGTTGGA" "DX_AAACGCTGTAACAGGC" "DX_AAACGCTTCGTGGTAT"
#> [10] "DX_AAAGAACAGAAGGTAG" "DX_AAAGAACAGCGTATGG" "DX_AAAGAACCATGCGTGC"
#> [13] "DX_AAAGAACGTACCTAGT" "DX_AAAGAACTCCTAGCCT" "DX_AAAGGATTCCTAGCCT"
#> [16] "DX_AAAGGATTCGATTGGT" "DX_AAAGGATTCGTTAGAC" "DX_AAAGGGCAGCAGCCTC"
#> [19] "DX_AAAGGGCAGGTTATAG" "DX_AAAGGGCAGTCGCTAT"
sobj %>%
  colnames() %>%
  head(20)
#>  [1] "DX_AAACCCAAGCGTATAA" "DX_AAACCCACATCATCTT" "DX_AAACCCAGTAGAGCTG"
#>  [4] "DX_AAACGAAAGAGTGACC" "DX_AAACGAATCCTACACC" "DX_AAACGAATCGTAGTCA"
#>  [7] "DX_AAACGAATCTGTTGGA" "DX_AAACGCTGTAACAGGC" "DX_AAACGCTTCGTGGTAT"
#> [10] "DX_AAAGAACAGAAGGTAG" "DX_AAAGAACAGCGTATGG" "DX_AAAGAACCATGCGTGC"
#> [13] "DX_AAAGAACGTACCTAGT" "DX_AAAGAACTCCTAGCCT" "DX_AAAGGATTCCTAGCCT"
#> [16] "DX_AAAGGATTCGATTGGT" "DX_AAAGGATTCGTTAGAC" "DX_AAAGGGCAGCAGCCTC"
#> [19] "DX_AAAGGGCAGGTTATAG" "DX_AAAGGGCAGTCGCTAT"
# Get feature names
sobj@assays$RNA@counts %>%
  rownames() %>%
  head(20)
#>  [1] "MIR1302-2HG" "FAM138A"     "OR4F5"       "AL627309.1"  "AL627309.3" 
#>  [6] "AL627309.2"  "AL627309.4"  "AL732372.1"  "OR4F29"      "AC114498.1" 
#> [11] "OR4F16"      "AL669831.2"  "AL669831.5"  "FAM87B"      "LINC00115"  
#> [16] "FAM41C"      "AL645608.6"  "AL645608.2"  "AL645608.4"  "LINC02593"
sobj %>%
  rownames() %>%
  head(20)
#>  [1] "MIR1302-2HG" "FAM138A"     "OR4F5"       "AL627309.1"  "AL627309.3" 
#>  [6] "AL627309.2"  "AL627309.4"  "AL732372.1"  "OR4F29"      "AC114498.1" 
#> [11] "OR4F16"      "AL669831.2"  "AL669831.5"  "FAM87B"      "LINC00115"  
#> [16] "FAM41C"      "AL645608.6"  "AL645608.2"  "AL645608.4"  "LINC02593"
# Get cell (colums) and feature (rows) counts
ncol(sobj)
#> [1] 6763
nrow(sobj)
#> [1] 33808


Accessing project meta.data

The Seurat object includes a data.frame that contains cell meta data for all of the assays present in the Seurat object. This data.frame is stored in the meta.data slot and can be accessed using double brackets ([[]]) or the FetchData function. As we move through our analysis, information will be automatically added to the meta.data. We can also manually add or modify information stored in the meta.data, which is useful when creating custom plots.

Initially the meta.data table will include the number of counts for each cell and the number of genes (or other features) detected for each cell. There is also an orig.ident column which contains the original cell labels (identities). Since we used the names.field argument when creating our Seurat object, this column will include the sample number.

# The meta.data table contains stats for each cell
sobj@meta.data %>%
  head()
#>                      orig.ident nCount_RNA nFeature_RNA nCount_ADT nFeature_ADT
#> DX_AAACCCAAGCGTATAA HTB_1261_DX       5002         1683       4765           19
#> DX_AAACCCACATCATCTT HTB_1261_DX       7356         2353       2031           19
#> DX_AAACCCAGTAGAGCTG HTB_1261_DX       5419         1754       1799           19
#> DX_AAACGAAAGAGTGACC HTB_1261_DX       6052         2068       2715           19
#> DX_AAACGAATCCTACACC HTB_1261_DX      21348         4442      10084           19
#> DX_AAACGAATCGTAGTCA HTB_1261_DX       8756         2799       2734           19
#>                     percent.mt
#> DX_AAACCCAAGCGTATAA  11.295482
#> DX_AAACCCACATCATCTT  21.574225
#> DX_AAACCCAGTAGAGCTG  24.580181
#> DX_AAACGAAAGAGTGACC  22.769332
#> DX_AAACGAATCCTACACC   9.223346
#> DX_AAACGAATCGTAGTCA  14.721334
sobj[["nCount_RNA"]] %>%
  head()
#>                     nCount_RNA
#> DX_AAACCCAAGCGTATAA       5002
#> DX_AAACCCACATCATCTT       7356
#> DX_AAACCCAGTAGAGCTG       5419
#> DX_AAACGAAAGAGTGACC       6052
#> DX_AAACGAATCCTACACC      21348
#> DX_AAACGAATCGTAGTCA       8756
sobj[["orig.ident"]] %>%
  head()
#>                      orig.ident
#> DX_AAACCCAAGCGTATAA HTB_1261_DX
#> DX_AAACCCACATCATCTT HTB_1261_DX
#> DX_AAACCCAGTAGAGCTG HTB_1261_DX
#> DX_AAACGAAAGAGTGACC HTB_1261_DX
#> DX_AAACGAATCCTACACC HTB_1261_DX
#> DX_AAACGAATCGTAGTCA HTB_1261_DX
sobj %>%
  FetchData(vars = "nCount_RNA") %>%
  head()
#>                     nCount_RNA
#> DX_AAACCCAAGCGTATAA       5002
#> DX_AAACCCACATCATCTT       7356
#> DX_AAACCCAGTAGAGCTG       5419
#> DX_AAACGAAAGAGTGACC       6052
#> DX_AAACGAATCCTACACC      21348
#> DX_AAACGAATCGTAGTCA       8756
# FetchData can also be used to retrieve other data from the Seurat object
sobj %>%
  FetchData(
    vars = "CD8A", 
    slot = "counts"
  ) %>%
  head()
#>                     CD8A
#> DX_AAACCCAAGCGTATAA    0
#> DX_AAACCCACATCATCTT    0
#> DX_AAACCCAGTAGAGCTG    0
#> DX_AAACGAAAGAGTGACC    0
#> DX_AAACGAATCCTACACC    0
#> DX_AAACGAATCGTAGTCA    0


Setting cell identities

The cell identities are used by various functions when grouping and plotting cells. When first creating a Seurat object, the cell identities will be set to the same values present in the orig.ident column in the meta.data table. The current cell identities are stored in the active.ident slot and can be accessed using the Idents function. Cell identities can be renamed using the RenameIdents function. Changing the cell identites will not alter the names stored in the meta.data table or vice versa. To update the meta.data table, we can use the AddMetaData function.

Before # Rename cell identities sobj <- sobj %>% RenameIdents( “1” = “Control” , “2” = “Treated”

# By default the cell identity is set to the project name
sobj@active.ident %>%
  head()
#> DX_AAACCCAAGCGTATAA DX_AAACCCACATCATCTT DX_AAACCCAGTAGAGCTG DX_AAACGAAAGAGTGACC 
#>         HTB_1261_DX         HTB_1261_DX         HTB_1261_DX         HTB_1261_DX 
#> DX_AAACGAATCCTACACC DX_AAACGAATCGTAGTCA 
#>         HTB_1261_DX         HTB_1261_DX 
#> Levels: HTB_1261_DX HTB_1261_RL
sobj %>%
  Idents() %>%
  head()
#> DX_AAACCCAAGCGTATAA DX_AAACCCACATCATCTT DX_AAACCCAGTAGAGCTG DX_AAACGAAAGAGTGACC 
#>         HTB_1261_DX         HTB_1261_DX         HTB_1261_DX         HTB_1261_DX 
#> DX_AAACGAATCCTACACC DX_AAACGAATCGTAGTCA 
#>         HTB_1261_DX         HTB_1261_DX 
#> Levels: HTB_1261_DX HTB_1261_RL
# Rename cell identities
sobj <- sobj %>%
  RenameIdents(
    "HTB_1261_DX" = "1"  ,
    "HTB_1261_RL" = "2"
  )

sobj %>%
  Idents() %>%
  head()
#> DX_AAACCCAAGCGTATAA DX_AAACCCACATCATCTT DX_AAACCCAGTAGAGCTG DX_AAACGAAAGAGTGACC 
#>                   1                   1                   1                   1 
#> DX_AAACGAATCCTACACC DX_AAACGAATCGTAGTCA 
#>                   1                   1 
#> Levels: 1 2
# Add cell identities to the meta.data table
sobj <- sobj %>%
  AddMetaData(
    metadata = Idents(sobj),
    col.name = "orig.ident"
  )

sobj@meta.data %>%
  head()
#>                     orig.ident nCount_RNA nFeature_RNA nCount_ADT nFeature_ADT
#> DX_AAACCCAAGCGTATAA          1       5002         1683       4765           19
#> DX_AAACCCACATCATCTT          1       7356         2353       2031           19
#> DX_AAACCCAGTAGAGCTG          1       5419         1754       1799           19
#> DX_AAACGAAAGAGTGACC          1       6052         2068       2715           19
#> DX_AAACGAATCCTACACC          1      21348         4442      10084           19
#> DX_AAACGAATCGTAGTCA          1       8756         2799       2734           19
#>                     percent.mt
#> DX_AAACCCAAGCGTATAA  11.295482
#> DX_AAACCCACATCATCTT  21.574225
#> DX_AAACCCAGTAGAGCTG  24.580181
#> DX_AAACGAAAGAGTGACC  22.769332
#> DX_AAACGAATCCTACACC   9.223346
#> DX_AAACGAATCGTAGTCA  14.721334


Subsetting the Seurat object

The cells and features present in the Seurat object can be filtering using the subset function.

# Subset based on gene expression
sobj %>%
  subset(subset = CD8A > 1) %>%
  ncol()
#> [1] 72
# Subset based on cell identity
sobj %>%
  subset(idents = "1") %>%
  ncol()
#> [1] 3077
# Subset with vector of features
sobj %>%
  subset(features = c("CD4", "CD8A")) %>%
  rownames()
#> [1] "CD4"  "CD8A"
# Subset using multiple criteria
sobj %>%
  subset(CD8A > 1, idents = "1") %>%
  ncol()
#> [1] 44
# Subset based on meta.data columns
sobj %>%
  subset(nFeature_RNA > 250)
#> An object of class Seurat 
#> 33827 features across 6570 samples within 2 assays 
#> Active assay: RNA (33808 features, 0 variable features)
#>  1 other assay present: ADT


Other data slots

sobj@reductions
sobj@graphs



Assessing quality

Metrics that are commonly used to assess cell quality include:

A low number of counts, a low number of detected genes, and a high percentage of mitochondrial counts suggests that the cell had a broken membrane and the cytoplasmic mRNA leaked out. Conversely, an abnormally high number of counts and detected genes could indicate the presence of a doublet.


Quantify mitochondrial counts

To calculate the percentage of mitochondrial counts for each cell we can use the PercentageFeatureSet function.

# Calculate percentage of mitochondrial reads for each cell
sobj <- sobj %>%
  PercentageFeatureSet(
    pattern  = "^MT-", 
    col.name = "percent_mito"
  )

sobj@meta.data %>%
  head()
#>                     orig.ident nCount_RNA nFeature_RNA nCount_ADT nFeature_ADT
#> DX_AAACCCAAGCGTATAA          1       5002         1683       4765           19
#> DX_AAACCCACATCATCTT          1       7356         2353       2031           19
#> DX_AAACCCAGTAGAGCTG          1       5419         1754       1799           19
#> DX_AAACGAAAGAGTGACC          1       6052         2068       2715           19
#> DX_AAACGAATCCTACACC          1      21348         4442      10084           19
#> DX_AAACGAATCGTAGTCA          1       8756         2799       2734           19
#>                     percent.mt percent_mito
#> DX_AAACCCAAGCGTATAA  11.295482    11.295482
#> DX_AAACCCACATCATCTT  21.574225    21.574225
#> DX_AAACCCAGTAGAGCTG  24.580181    24.580181
#> DX_AAACGAAAGAGTGACC  22.769332    22.769332
#> DX_AAACGAATCCTACACC   9.223346     9.223346
#> DX_AAACGAATCGTAGTCA  14.721334    14.721334


EXERCISE: Create violin/box plots comparing cell metrics

# Create violin/boxplots comparing nCount, nFeature, and percent_mito for each sample

# sobj@meta.data %>%


ANSWER


EXERCISE: Create scatter plots comparing cell metrics

# Create a scatter plot comparing nCount and percent_mito

# sobj@meta.data %>%



# Create a scatter plot comparing nCount and nFeature

# sobj@meta.data %>%


ANSWER



Filtering cells

To filter cells we can use the subset function.

# Filter cells based on number of genes and percent mito UMIs
sobj <- sobj %>%
  subset(
    nFeature_RNA > 250 &   # Remove cells with <250 detected genes
    nFeature_RNA < 2500 &  # Remove cells with >2500 detected genes (could be doublets)
    percent_mito < 10      # Remove cells with >10% mitochondrial reads
  )

# Add filtering cutoffs on original scatter plots
p_1 <- p_1 +
  geom_hline(yintercept = 10, linetype = 2)

p_2 <- p_2 +
  geom_hline(yintercept = c(250, 2500), linetype = 2)

CombinePlots(
  plots = list(p_1, p_2), 
  nrow  = 2
)

# How many cells were removed?
ncol(sobj)
#> [1] 949


Saving the Seurat object

dir.create("data", showWarnings = FALSE)

write_rds(sobj, file = "data/filtered_sobj.rds")