Single nucleus

Processing of data from “Human and mouse single-nucleus transcriptomics reveal TREM2-dependent and TREM2-independent cellular responses in Alzheimer’s disease” Zhou et al. 2020
https://www.nature.com/articles/s41591-019-0695-9#data-availability

Set working directory

Load libraries

Set variables and thresholds

pathToRef <- "/research/labs/neurology/fryer/projects/references/mouse/refdata-gex-mm10-2020-A"
pathToTestType <- "15months/"
treatment <- "mouse"
tissue <- "Brain"
seqType <- "nucleus"
tool <- "cellranger"

# brain_region groups
brain_region <- "all"
brain_region1 <- "Cor"
brain_region2 <- "Hip"

# genotype
WT <- "WT"
Trem2_KO <- "Trem2_KO"
WT_5XFAD <- "WT_5XFAD"
Trem2_KO_5XFAD <- "Trem2_KO_5XFAD"
sample_order <- c(
      "WT_Cor",
      "Trem2_KO_Cor",
      "WT_5XFAD_Cor",
      "Trem2_KO_5XFAD_Cor",
      "WT_Hip",
      "Trem2_KO_Hip",
      "WT_5XFAD_Hip",
      "Trem2_KO_5XFAD_Hip"
)

QC thresholds as defined by the paper

nCount.min <- 300
nCount.max <- 9000
nFeature.min <- 300
nFeature.max <- 5600
mt.cutoff <- 5

Setup seurat object

Using CellBender filtered output.

prefix <- "../../counts/15months/"

if (tissue == "Brain" && file.exists(paste0("../../rObjects/", 
                        pathToTestType, "merged.rds"))) {
  data <- readRDS(paste0("../../rObjects/", 
                        pathToTestType, "merged.rds"))
} else if (tissue == "Brain") {
  # individual sample objects
  WT_Cor <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4160643_WT_Cor/")))
  Trem2_KO_Cor <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4160644_Trem2_KO_Cor/")))
  WT_5XFAD_Cor <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4160645_WT_5XFAD_Cor/")))
  Trem2_KO_5XFAD_Cor <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4160646_Trem2_KO_5XFAD_Cor/")))
  WT_Hip <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4160647_WT_Hip/")))
  Trem2_KO_Hip <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4160648_Trem2_KO_Hip/")))
  WT_5XFAD_Hip <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4160649_WT_5XFAD_Hip/")))
  Trem2_KO_5XFAD_Hip <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4160650_Trem2_KO_5XFAD_Hip/")))

  # merge objects
  data <- merge(
    x = WT_Cor,
    y = c(
      Trem2_KO_Cor,
      WT_5XFAD_Cor,
      Trem2_KO_5XFAD_Cor,
      WT_Hip,
      Trem2_KO_Hip,
      WT_5XFAD_Hip,
      Trem2_KO_5XFAD_Hip
    ),
    add.cell.ids = c(
      "WT_Cor",
      "Trem2_KO_Cor",
      "WT_5XFAD_Cor",
      "Trem2_KO_5XFAD_Cor",
      "WT_Hip",
      "Trem2_KO_Hip",
      "WT_5XFAD_Hip",
      "Trem2_KO_5XFAD_Hip"
    ),
    project = paste0("15months", brain_region, " - Single Nucleus")
  )
  
  # cleanup and save
  remove(
      WT_Cor,
      Trem2_KO_Cor,
      WT_5XFAD_Cor,
      Trem2_KO_5XFAD_Cor,
      WT_Hip,
      Trem2_KO_Hip,
      WT_5XFAD_Hip,
      Trem2_KO_5XFAD_Hip
  )
  saveRDS(data,
          paste0("../../rObjects/", pathToTestType, "merged.rds"))
}

# preview
data
## An object of class Seurat 
## 27998 features across 42520 samples within 1 assay 
## Active assay: RNA (27998 features, 0 variable features)

nCount_RNA = total number of transcripts (UMIs) in a single cell nFeature_RNA = number of unique genes (features)

Add sample information to seurat object

# create sample column
barcodes <- colnames(data)
pattern <- "(.+)_[ACGT]+-(\\d+)"
sample <- str_match(barcodes, pattern)[, 2]
table(sample)
## sample
## Trem2_KO_5XFAD_Cor Trem2_KO_5XFAD_Hip       Trem2_KO_Cor       Trem2_KO_Hip 
##               6670               3215               6912               2959 
##       WT_5XFAD_Cor       WT_5XFAD_Hip             WT_Cor             WT_Hip 
##               4816               4010              10549               3389
data$sample <- factor(
  sample,
  levels = c(
      "WT_Cor",
      "Trem2_KO_Cor",
      "WT_5XFAD_Cor",
      "Trem2_KO_5XFAD_Cor",
      "WT_Hip",
      "Trem2_KO_Hip",
      "WT_5XFAD_Hip",
      "Trem2_KO_5XFAD_Hip"
  )
)
table(data$sample)  # check
## 
##             WT_Cor       Trem2_KO_Cor       WT_5XFAD_Cor Trem2_KO_5XFAD_Cor 
##              10549               6912               4816               6670 
##             WT_Hip       Trem2_KO_Hip       WT_5XFAD_Hip Trem2_KO_5XFAD_Hip 
##               3389               2959               4010               3215
Idents(data) <- data$sample

Add genotype column to metadata

# create treatment column
geno <- gsub("WT_Cor", WT, data$sample)
geno <- gsub("WT_Hip", WT, geno)
geno <- gsub("Trem2_KO_Cor", Trem2_KO, geno)
geno <- gsub("Trem2_KO_Hip", Trem2_KO, geno)
geno <- gsub("WT_5XFAD_Cor", WT_5XFAD, geno)
geno <- gsub("WT_5XFAD_Hip", WT_5XFAD, geno)
geno <- gsub("Trem2_KO_5XFAD_Cor", Trem2_KO_5XFAD, geno)
geno <- gsub("Trem2_KO_5XFAD_Hip", Trem2_KO_5XFAD, geno)

data$genotype <-
  factor(geno, levels = c(WT, Trem2_KO, WT_5XFAD, Trem2_KO_5XFAD))
table(data$genotype)
## 
##             WT       Trem2_KO       WT_5XFAD Trem2_KO_5XFAD 
##          13938           9871           8826           9885

Add brain_region column to metadata

# create treatment column
repl <- gsub("WT_Cor", brain_region1, data$sample)
repl <- gsub("Trem2_KO_Cor", brain_region1, repl)
repl <- gsub("WT_5XFAD_Cor", brain_region1, repl)
repl <- gsub("Trem2_KO_5XFAD_Cor", brain_region1, repl)
repl <- gsub("WT_Hip", brain_region2, repl)
repl <- gsub("Trem2_KO_Hip", brain_region2, repl)
repl <- gsub("WT_5XFAD_Hip", brain_region2, repl)
repl <- gsub("Trem2_KO_5XFAD_Hip", brain_region2, repl)

data$brain_region <- 
  factor(repl, levels = c(brain_region1, brain_region2))
table(data$brain_region )
## 
##   Cor   Hip 
## 28947 13573

Obtain quality information

# cell.complexity
data$cell.complexity <-
  log10(data$nFeature_RNA) / log10(data$nCount_RNA)

# get gene names
gene.names <- rownames(data)
# percent MT
mt.genes <- gene.names[grep("^mt-", gene.names)]
mt.genes
##  [1] "mt-Nd1"  "mt-Nd2"  "mt-Co1"  "mt-Co2"  "mt-Atp8" "mt-Atp6" "mt-Co3" 
##  [8] "mt-Nd3"  "mt-Nd4l" "mt-Nd4"  "mt-Nd5"  "mt-Nd6"  "mt-Cytb"
data$percent.mt <- PercentageFeatureSet(data, pattern = "^mt-")

# ribosomal proteins begin with 'Rpl' or 'Rsl' for this annotation file
ribo.genes <- gene.names[grep("^Hb[ba]-", gene.names)]
ribo.genes
## [1] "Hbb-bt"  "Hbb-bs"  "Hbb-bh2" "Hbb-bh1" "Hbb-y"   "Hba-x"   "Hba-a1" 
## [8] "Hba-a2"
data$percent.ribo <- PercentageFeatureSet(data, pattern = "^R[sp]l")
# mitochondrial ribosomes
mt.ribo.genes <- gene.names[grep("^Mrp[sl]", gene.names)]
mt.ribo.genes
##  [1] "Mrpl15"  "Mrpl30"  "Mrps9"   "Mrpl44"  "Mrps14"  "Mrpl41"  "Mrps2"  
##  [8] "Mrps5"   "Mrps26"  "Mrps28"  "Mrpl47"  "Mrpl24"  "Mrpl9"   "Mrps21" 
## [15] "Mrpl50"  "Mrpl37"  "Mrps15"  "Mrpl20"  "Mrpl33"  "Mrpl1"   "Mrps18c"
## [22] "Mrps17"  "Mrps33"  "Mrpl35"  "Mrpl19"  "Mrpl53"  "Mrps25"  "Mrpl51" 
## [29] "Mrps35"  "Mrps12"  "Mrpl46"  "Mrps11"  "Mrpl48"  "Mrpl17"  "Mrpl23" 
## [36] "Mrpl54"  "Mrpl42"  "Mrps31"  "Mrpl34"  "Mrps16"  "Mrpl52"  "Mrpl57" 
## [43] "Mrpl4"   "Mrps22"  "Mrpl3"   "Mrps24"  "Mrpl22"  "Mrpl55"  "Mrps23" 
## [50] "Mrpl27"  "Mrpl10"  "Mrpl45"  "Mrps7"   "Mrpl38"  "Mrpl12"  "Mrpl32" 
## [57] "Mrpl36"  "Mrps27"  "Mrps36"  "Mrps30"  "Mrpl13"  "Mrpl40"  "Mrpl39" 
## [64] "Mrps6"   "Mrpl18"  "Mrps34"  "Mrpl28"  "Mrps18b" "Mrpl14"  "Mrps18a"
## [71] "Mrpl2"   "Mrps10"  "Mrpl21"  "Mrpl11"  "Mrpl49"  "Mrpl16"  "Mrpl43"
data$percent.mt.ribo <-
  PercentageFeatureSet(data, pattern = "^Mrp[sl]")

# hemoglobin genes
hb.genes <- gene.names[grep("^Hb[ba]-", gene.names)]
hb.genes
## [1] "Hbb-bt"  "Hbb-bs"  "Hbb-bh2" "Hbb-bh1" "Hbb-y"   "Hba-x"   "Hba-a1" 
## [8] "Hba-a2"
data$percent.hb <-
  PercentageFeatureSet(data, pattern = "^Hb[ba]-")

# percent Ttr in each cell
data$percent.Ttr <- PercentageFeatureSet(data, features = "Ttr")

# percent Apoe in each cell
data$percent.Apoe <- PercentageFeatureSet(data, features = "Apoe")

Pre-filtering QC

Number of cells

# Visualize the number of cell counts per sample
cell_count_data <- as.data.frame(table(data$sample))
colnames(cell_count_data) <- c("sample", "frequency")

ncellsRaw <-
  ggplot(cell_count_data, aes(x = sample, y = frequency, fill = sample)) +
  geom_col() +
  theme_classic() +
  geom_text(aes(label = frequency),
            position = position_dodge(width = 0.9),
            vjust = -0.25) +
  scale_y_continuous(breaks = seq(0, 12000, by = 2000),
                     limits = c(0, 12000)) +
  ggtitle("Raw: cells per sample") +
  theme(legend.position =  "none") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ncellsRaw

Density plots

QC metrics

# Visualize nCount_RNA
denCount <- ggplot(data@meta.data,
  aes(color = sample,
  x = nCount_RNA,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  xlab("nCount_RNA") +
  ylab("Density") +
  geom_vline(xintercept = nCount.min) +
  geom_vline(xintercept = nCount.max) +
  theme(legend.position = "none")

# Visualize nCount_RNA
denFeature <- ggplot(data@meta.data,
  aes(color = sample,
  x = nFeature_RNA,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  xlab("nFeature_RNA") +
  ylab("Density") +
  geom_vline(xintercept = nFeature.min) +
  geom_vline(xintercept = nFeature.max) +
  theme(legend.position = "none")

# Visualize cell complexity
# Quality cells are usually above 0.85
denCellComplexity <- ggplot(data@meta.data,
  aes(color = sample,
  x = cell.complexity,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  xlab("Cell Complexity (log10(nFeature/nCount))") +
  ylab("Density") +
  theme(legend.position = "none")

# Arrange graphs in grid
plots <- list(denCount, denFeature, denCellComplexity)
layout <- rbind(c(1), c(2), c(3))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

# Visualize percent.mt
denMt <- ggplot(data@meta.data,
  aes(color = sample,
  x = percent.mt,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  geom_vline(xintercept = mt.cutoff) +
  xlab("% Mitochondrial Genes") +
  ylab("Density") +
  theme(legend.position = "none")

# Visualize percent.ribo
denRb <- ggplot(data@meta.data,
  aes(color = sample,
  x = percent.ribo,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  xlab("% Ribosomal Genes") +
  ylab("Density") +
  theme(legend.position = "none")

# Visualize percent.hb
denHb <- ggplot(data@meta.data,
  aes(color = sample,
  x = percent.hb,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  xlab("% Hemoglobin Genes") +
  ylab("Density") +
  theme(legend.position = "none")

# Arrange graphs in grid
plots <- list(denMt, denRb, denHb)
layout <- rbind(c(1), c(2), c(3))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

### gene-level

# Visualize percent.Ttr
denTtr <- ggplot(data@meta.data,
  aes(color = sample,
  x = log2(percent.Ttr),
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  xlab("percent Ttr") +
  ylab("Density") +
  theme(legend.position = "none")
denTtr
## Warning: Removed 30402 rows containing non-finite values (stat_density).

# Visualize percent.Apoe
denApoe <- ggplot(data@meta.data,
  aes(color = sample,
  x = log2(percent.Apoe),
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  xlab("percent Apoe") +
  ylab("Density") +
  theme(legend.position = "none")
denApoe
## Warning: Removed 17557 rows containing non-finite values (stat_density).

Violin plots

# nFeature, nCount, and cell.complexity violins
vCellInfo <- VlnPlot(
  data,
  features = c("nFeature_RNA", "nCount_RNA", "cell.complexity"),
  ncol = 3,
  group.by = 'sample',
  pt.size = 0
)

#  nFeature_RNA
vFeature <- VlnPlot(
  data,
  features = c("nFeature_RNA"),
  group.by = 'sample',
  pt.size = 0
) + theme(legend.position = "none")
vFeature

#  nCount_RNA
vCount <- VlnPlot(
  data,
  features = c("nCount_RNA"),
  group.by = 'sample',
  pt.size = 0
) + theme(legend.position = "none")
vCount

#  cell.complexity
vCC <- VlnPlot(
  data,
  features = c("cell.complexity"),
  group.by = 'sample',
  pt.size = 0
) + theme(legend.position = "none")
vCC

# percent mt, ribo, hb
vMtRiboHb <- VlnPlot(
  data,
  features = c("percent.mt", "percent.ribo", "percent.hb"),
  ncol = 3,
  group.by = 'sample',
  pt.size = 0
)

#  percent mt
vMt <- VlnPlot(
  data,
  features = c("percent.mt"),
  group.by = 'sample',
  pt.size = 0.001
) + theme(legend.position = "none")
vMt

#  percent ribosomal
vRb <- VlnPlot(
  data,
  features = c("percent.ribo"),
  group.by = 'sample',
  pt.size = 0.001
) + theme(legend.position = "none")
vRb

#  percent hemoglobin genes
vHb <- VlnPlot(
  data,
  features = c("percent.hb"),
  group.by = 'sample',
  pt.size = 0.001
) + theme(legend.position = "none")
vHb

# Ttr
vTtr <- VlnPlot(
  data,
  features = c("percent.Ttr"),
  group.by = 'sample',
  pt.size = 0.001
) + theme(legend.position = "none")
vTtr

#  Apoe
vApoe <- VlnPlot(
  data,
  features = c("percent.Apoe"),
  group.by = 'sample',
  pt.size = 0.001
) + theme(legend.position = "none")
vApoe

Scatter plots

sMt <- ggplot(
  data@meta.data,
  aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) + 
  geom_point() + 
  stat_smooth(method=lm) +
    scale_x_log10() +       
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = nCount.min) + 
  geom_vline(xintercept = nCount.max) +
  geom_hline(yintercept = nFeature.min) + 
  geom_hline(yintercept = nFeature.max) + 
  facet_wrap(~sample, ncol = 3) +
  scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
sMt
## `geom_smooth()` using formula 'y ~ x'

sMt <- FeatureScatter(data,
               feature1 = "nCount_RNA",
               feature2 = "percent.mt",
               group.by = 'sample',
               shuffle = TRUE)
sMt

Filtering

Cell-level filtering

Filtering as defined by the manuscript

a cutoff of 300–9,000 UMI, 300–5,600 Gene was applied.

We will filter based on 6 conditions:
– nCount_RNA between 300 - 9,000 – nFeature_RNA between 300 - 5,600 – percent.mt < 5

# filter
data.filtered <- subset(
  data,
  subset = (nCount_RNA > nCount.min) &
    (nCount_RNA < nCount.max) &
    (nFeature_RNA > nFeature.min) &
    (nFeature_RNA < nFeature.max) &
    (percent.mt < mt.cutoff)
)

# print cells removed
print(paste0(dim(data)[2] - dim(data.filtered)[2], " cells removed"))
## [1] "8699 cells removed"
cell_count_data <- as.data.frame(table(data.filtered$sample))
colnames(cell_count_data) <- c("sample", "frequency")

ncellsFilter1 <-
  ggplot(cell_count_data, aes(x = sample, y = frequency, fill = sample)) +
  geom_col() +
  theme_classic() +
  geom_text(aes(label = frequency),
  position = position_dodge(width = 0.9), vjust = -0.25) +
  scale_y_continuous(breaks = seq(0, 12000, by = 2000),
  limits = c(0, 12000)) +
  ggtitle("Filtered: cells per sample") +
  theme(legend.position =  "none") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ncellsFilter1

# Arrange graphs in grid
plots <- list(ncellsRaw, ncellsFilter1)
layout <- cbind(c(1), c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

Gene-level filtering

Remove lowly expressed genes. We will keep genes that have at least 1 count in 10 cells.

# filter genes
counts <- GetAssayData(object = data.filtered, slot = "counts")
nonzero <- counts > 1  # produces logical
keep <- Matrix::rowSums(nonzero) >= 10  # sum the true/false
counts.filtered <- counts[keep,]  # keep certain genes

# overwrite data.filtered
data.filtered <- CreateSeuratObject(counts.filtered, 
                                    meta.data = data.filtered@meta.data)

# print features removed
print(paste0(dim(counts)[1] - dim(counts.filtered)[1], " features removed"))
## [1] "19995 features removed"
# which features (aka genes) were removed?
all.genes <- rownames(counts)
min.exp.genes <- rownames(counts.filtered)
genes.removed <- setdiff(all.genes, min.exp.genes)

Doublet filtering

https://github.com/chris-mcginnis-ucsf/DoubletFinder\

Heterotypic - doublets derived from transcriptionally distinct cells. DoubletFinder works best on this type of doublet.

Homotopic - Transcriptionally similar cell doublets. DoubletFinder does not work as great on this type of doublet.

pANN - proportion of artificial nearest neighbors (pANN)

BCMVN - mean-variance normalized bimodality coefficient of pANN distributions produced during pN -pK parameter sweeps. The BCMVN may be used to identify the pK parameter.

Overview of steps:
A. Prepare each sample
B. pK Identification (no ground-truth) - defines the PC neighborhood size used to compute pANN
C. Homotypic Doublet Proportion Estimate - homotypic doublets may not be a problem depending on the type of analysis you are performing. If you have some doublets of the same type and their counts are normalized, they will generally represent the profile of single cells of the same type.
D. DoubletFinder
E. Visualize where the doublets are located

save object

read in object

data.split <- readRDS(paste0("../../rObjects/", pathToTestType, 
              tolower(tissue),"_post_filter_split.rds"))
# converge data.split
data.singlets <- merge(
  x = data.split[[1]],
  y = c(
    data.split[[2]],
    data.split[[3]],
    data.split[[4]],
    data.split[[5]],
    data.split[[6]],
    data.split[[7]],
    data.split[[8]]
  ),
  project = paste0("Colonna data ", tissue, " Single Nucleus")
)


# print how many cells removed
print(paste0(dim(data.filtered)[2] - dim(data.singlets)[2], " cells removed"))
## [1] "838 cells removed"
# overwrite data.filtered
data.filtered <- data.singlets

# reset levels
data.filtered$genotype <- factor(data.filtered$genotype,
                                 levels = c("WT", "Trem2_KO", "WT_5XFAD", "Trem2_KO_5XFAD"))

data.filtered$brain_region <- factor(data.filtered$brain_region,
                                  levels = c(brain_region1, brain_region2))

data.filtered$sample <- factor(
  data.filtered$sample,
  levels = sample_order
)

# cleanup
remove(data.singlets,
       data.split,
       data_sample_singlets,
       data_sample)
## Warning in remove(data.singlets, data.split, data_sample_singlets, data_sample):
## object 'data_sample_singlets' not found
## Warning in remove(data.singlets, data.split, data_sample_singlets, data_sample):
## object 'data_sample' not found
remove(n_cells,
       n_cells_singlets,
       ncell_matrix,
       ncell_max,
       ncells_per_cluster,
       ncells_melt)
## Warning in remove(n_cells, n_cells_singlets, ncell_matrix, ncell_max,
## ncells_per_cluster, : object 'n_cells' not found
## Warning in remove(n_cells, n_cells_singlets, ncell_matrix, ncell_max,
## ncells_per_cluster, : object 'n_cells_singlets' not found
## Warning in remove(n_cells, n_cells_singlets, ncell_matrix, ncell_max,
## ncells_per_cluster, : object 'ncell_matrix' not found
## Warning in remove(n_cells, n_cells_singlets, ncell_matrix, ncell_max,
## ncells_per_cluster, : object 'ncell_max' not found
## Warning in remove(n_cells, n_cells_singlets, ncell_matrix, ncell_max,
## ncells_per_cluster, : object 'ncells_per_cluster' not found
## Warning in remove(n_cells, n_cells_singlets, ncell_matrix, ncell_max,
## ncells_per_cluster, : object 'ncells_melt' not found
remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, sum.stdv, stdv)
## Warning in remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, :
## object 'sweep.res.list' not found
## Warning in remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, :
## object 'sweep.stats' not found
## Warning in remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, :
## object 'bcmvn' not found
## Warning in remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, :
## object 'bcmvn_max' not found
## Warning in remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, :
## object 'difference' not found
## Warning in remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, :
## object 'sum.stdv' not found
## Warning in remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, :
## object 'stdv' not found
remove(d1, d2, d3, f1, f2, b1)
## Warning in remove(d1, d2, d3, f1, f2, b1): object 'd1' not found
## Warning in remove(d1, d2, d3, f1, f2, b1): object 'd2' not found
## Warning in remove(d1, d2, d3, f1, f2, b1): object 'd3' not found
## Warning in remove(d1, d2, d3, f1, f2, b1): object 'f1' not found
## Warning in remove(d1, d2, d3, f1, f2, b1): object 'f2' not found
## Warning in remove(d1, d2, d3, f1, f2, b1): object 'b1' not found
remove(counts, counts.filtered, nonzero, co1, co2, cellmax, min.pc)
## Warning in remove(counts, counts.filtered, nonzero, co1, co2, cellmax, min.pc):
## object 'co1' not found
## Warning in remove(counts, counts.filtered, nonzero, co1, co2, cellmax, min.pc):
## object 'co2' not found
## Warning in remove(counts, counts.filtered, nonzero, co1, co2, cellmax, min.pc):
## object 'cellmax' not found
## Warning in remove(counts, counts.filtered, nonzero, co1, co2, cellmax, min.pc):
## object 'min.pc' not found

Histogram

All samples together

# User params
goi <- "Ttr"
sample_oi <- "ALL"
threshold <- 2

# Subset data
log2.threshold <- log2(threshold + 0.01)
counts.df <- FetchData(data.filtered, vars = goi)
colnames(counts.df) <- "counts"
log2.counts.df <- log2(counts.df + 0.01)

# Histogram
title <-
  paste0("Colonna Brain Cell: ",
         goi,
         " - ",
         sample_oi,
         "\nnCount_RNA > ",
         threshold)
hist1 <- ggplot(counts.df, aes(x = counts)) +
  geom_histogram(bins = 100, fill = "gray", color = "black") +
  labs(title = title, x = NULL, y = NULL) +
  xlab(paste0(goi, " nCount_RNA")) + ylab("# of cells") + 
  theme_bw() +
  geom_vline(xintercept = threshold, col = "blue") +
  annotate(
    "rect",
    xmin = -Inf,
    xmax = threshold,
    ymin = 0,
    ymax = Inf,
    alpha = 0.2,
    fill = "deepskyblue"
  ) +
  annotate(
    "rect",
    xmin = threshold,
    xmax = Inf,
    ymin = 0,
    ymax = Inf,
    alpha = 0.2,
    fill = "chocolate4"
  )

# Histogram log transformed
hist2 <- ggplot(log2.counts.df, aes(x = counts)) +
  geom_histogram(bins = 100, fill = "gray", color = "black") +
  labs(title = title, x = NULL, y = NULL) +
  xlab(paste0("Log2(", goi, " nCount_RNA)")) + ylab("# of cells") + 
  theme_bw() +
  geom_vline(xintercept = log2.threshold, col = "blue") +
  annotate(
    "rect",
    xmin = -Inf,
    xmax = log2.threshold,
    ymin = 0,
    ymax = Inf,
    alpha = 0.2,
    fill = "deepskyblue"
  ) +
  annotate(
    "rect",
    xmin = log2.threshold,
    xmax = Inf,
    ymin = 0,
    ymax = Inf,
    alpha = 0.2,
    fill = "chocolate4"
  )

# plot
plots <- list(hist1, hist2)
layout <- rbind(c(1), c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

# number removed
table(counts.df$counts > threshold)
## 
## FALSE  TRUE 
## 24879  8104

Malat1 all samples

# User params
goi <- "Malat1"
threshold <- 0.1

# Subset data
log2.threshold <- log2(threshold + 0.01)
counts.df <- FetchData(data.filtered, vars = goi)
colnames(counts.df) <- "counts"
log2.counts.df <- log2(counts.df + 0.01)

# Histogram
title <-
  paste0("Colonna Brain Cell: ", goi, "\nnCount_RNA > ", threshold)
hist1 <- ggplot(counts.df, aes(x = counts)) +
  geom_histogram(bins = 100, fill = "gray", color = "black") +
  labs(title = title, x = NULL, y = NULL) +
  xlab(paste0(goi, " nCount_RNA")) + ylab("# of Samples") +
  theme_bw() +
  geom_vline(xintercept = threshold, col = "blue") +
  annotate(
    "rect",
    xmin = -Inf,
    xmax = threshold,
    ymin = 0,
    ymax = Inf,
    alpha = 0.2,
    fill = "deepskyblue"
  ) +
  annotate(
    "rect",
    xmin = threshold,
    xmax = Inf,
    ymin = 0,
    ymax = Inf,
    alpha = 0.2,
    fill = "chocolate4"
  )

# Histogram log transformed
hist2 <- ggplot(log2.counts.df, aes(x = counts)) +
  geom_histogram(bins = 100, fill = "gray", color = "black") +
  labs(title = title, x = NULL, y = NULL) +
  xlab(paste0("Log2(", goi, " nCount_RNA)")) + ylab("# of Samples") +
  theme_bw() +
  geom_vline(xintercept = log2.threshold, col = "blue") +
  annotate(
    "rect",
    xmin = -Inf,
    xmax = log2.threshold,
    ymin = 0,
    ymax = Inf,
    alpha = 0.2,
    fill = "deepskyblue"
  ) +
  annotate(
    "rect",
    xmin = log2.threshold,
    xmax = Inf,
    ymin = 0,
    ymax = Inf,
    alpha = 0.2,
    fill = "chocolate4"
  )
# plot
plots <- list(hist1, hist2)
layout <- rbind(c(1), c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

# number removed
table(counts.df$counts > threshold)
## 
##  TRUE 
## 32983

Split by sample

Ttr by sample

## [1] 1
## [1] 1

## [1] 2
## [1] 2

## [1] 3
## [1] 3

## [1] 4
## [1] 4

## [1] 5
## [1] 5

## [1] 6
## [1] 6

## [1] 7
## [1] 7

## [1] 8
## [1] 8

Number of cells

# Visualize the number of cell counts per sample
cell_count_data <- as.data.frame(table(data.filtered$sample))
colnames(cell_count_data) <- c("sample", "frequency")

ncellsFiltered <-
  ggplot(cell_count_data, aes(x = sample, y = frequency, fill = sample)) +
  geom_col() +
  theme_classic() +
  geom_text(aes(label = frequency),
            position = position_dodge(width = 0.9),
            vjust = -0.25) +
  scale_y_continuous(breaks = seq(0, 12000, by = 2000),
                     limits = c(0, 12000)) +
  ggtitle("Filtered: cells per sample") +
  theme(legend.position =  "none") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Arrange graphs in grid
plots <- list(ncellsRaw, ncellsFiltered)
layout <- cbind(c(1), c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

Density plots

# Visualize nCount_RNA
denCount_filter <- ggplot(data.filtered@meta.data,
  aes(color = sample,
  x = nCount_RNA,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  xlab("nCount_RNA") +
  ylab("Density") +
  geom_vline(xintercept = nCount.min) +
  geom_vline(xintercept = nCount.max) +
  theme(legend.position = "none")

# Visualize nCount_RNA
denFeature_filter <- ggplot(data.filtered@meta.data,
  aes(color = sample,
  x = nFeature_RNA,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  xlab("nFeature_RNA") +
  ylab("Density") +
  geom_vline(xintercept = nFeature.min) +
  geom_vline(xintercept = nFeature.max) +
  theme(legend.position = "none")

# Visualize cell complexity
# Quality cells are usually above 0.85
denCellComplexity_filter <- ggplot(data.filtered@meta.data,
  aes(color = sample,
  x = cell.complexity,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  xlab("Cell Complexity (log10(nFeature/nCount))") +
  ylab("Density") +
  theme(legend.position = "none")

# Visualize percent.mt
denMt_filter <- ggplot(data.filtered@meta.data,
  aes(color = sample,
  x = percent.mt,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  geom_vline(xintercept = mt.cutoff) +
  xlab("% Mitochondrial Genes") +
  ylab("Density") +
  theme(legend.position = "none")

# Arrange graphs in grid
plots <- list(
  denCount,
  denFeature,
  denCellComplexity,
  denCount_filter,
  denFeature_filter,
  denCellComplexity_filter
)
layout <- rbind(c(1, 4), c(2, 5), c(3, 6))
grid_den_raw_filter <-
  grid.arrange(grobs = plots, layout_matrix = layout)

# Visualize percent.mt
denMt_filter <- ggplot(data.filtered@meta.data,
  aes(color = sample,
  x = percent.mt,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  geom_vline(xintercept = mt.cutoff) +
  xlab("% Mitochondrial Genes") +
  ylab("Density") +
  theme(legend.position = "none")

# Visualize percent.ribo
denRb_filter <- ggplot(data.filtered@meta.data,
  aes(color = sample,
  x = percent.ribo,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  xlab("% Ribosomal Genes") +
  ylab("Density") +
  theme(legend.position = "none")

# Visualize percent.hb
denHb_filter <- ggplot(data.filtered@meta.data,
  aes(color = sample,
  x = percent.hb,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  xlab("% Hemoglobin Genes") +
  ylab("Density") +
  theme(legend.position = "none")

# Arrange graphs in grid
plots <- list(denMt, denRb, denHb, denMt_filter, denRb_filter, denHb_filter)
layout <- rbind(c(1, 4), c(2, 5), c(3, 6))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

save

Density plots for genes of interest

# Visualize the number of counts per cell
denApoe_filter <- ggplot(data.filtered@meta.data,
       aes(color = sample,
           x = log2(percent.Apoe),
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  xlab("percent Apoe") +
  ylab("Density") +
  theme(legend.position = "none")
denApoe_filter
## Warning: Removed 14922 rows containing non-finite values (stat_density).

denTtr_filter <- ggplot(data.filtered@meta.data,
       aes(color = sample,
           x = log2(percent.Ttr),
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  xlab("percent Ttr") +
  ylab("Density") +
  theme(legend.position = "none")
denTtr_filter
## Warning: Removed 23827 rows containing non-finite values (stat_density).

# Arrange graphs in grid
plots <- list(denApoe, denTtr,  denApoe_filter, denTtr_filter)
layout <- rbind(c(1,3), c(2,4))
grid_den_raw_filter_genes <- grid.arrange(grobs = plots, layout_matrix = layout)
## Warning: Removed 17557 rows containing non-finite values (stat_density).
## Warning: Removed 30402 rows containing non-finite values (stat_density).
## Warning: Removed 14922 rows containing non-finite values (stat_density).
## Warning: Removed 23827 rows containing non-finite values (stat_density).

Violin plots

# nFeature, nCount, and cell.complexity violins
vCellInfo <- VlnPlot(data.filtered,
              features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
              ncol = 3,
              group.by = 'sample',
              pt.size = 0)
vCellInfo

#  percent Mt, ribo, hemo
vMtRiboHb <- VlnPlot(data.filtered,
              features = c("percent.mt","percent.ribo","percent.hb"),
              ncol = 3,
              group.by = 'sample',
              pt.size = 0.001)
vMtRiboHb

#  percent Ttr
vTtr <- VlnPlot(data.filtered,
              features = c("percent.Ttr"),
              group.by = 'sample',
              pt.size = 0)
vTtr

#  percent Apoe
vApoe<- VlnPlot(data.filtered,
              features = c("percent.Apoe"),
              group.by = 'sample',
              pt.size = 0)
vApoe

Scatter plots

sMt <- ggplot(
  data.filtered@meta.data,
  aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) + 
  geom_point() + 
  stat_smooth(method=lm) +
    scale_x_log10() +       
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = nCount.min) + 
  geom_vline(xintercept = nCount.max) + 
  geom_hline(yintercept = nFeature.min) + 
  geom_hline(yintercept = nFeature.max) +
  facet_wrap(~sample, ncol = 4) +
  scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
sMt
## `geom_smooth()` using formula 'y ~ x'

sMt <- FeatureScatter(data.filtered,
               feature1 = "nCount_RNA",
               feature2 = "percent.mt",
               group.by = 'sample',
               shuffle = TRUE)
sMt

Box plot

# Visualize the distribution of genes detected per cell via boxplot
b1 <- ggplot(data.filtered@meta.data,
       aes(x = sample, 
           y = log10(nFeature_RNA), 
           fill=sample)) + 
  geom_boxplot() + 
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  theme(plot.title = element_text(hjust = 0.5, face="bold")) +
  ggtitle("Unique Genes / Cell / Sample") +
  xlab("Sample") +
  theme(legend.position = "none")
b1

Top transcripts

df <- data.frame(row.names = rownames(data.filtered))
df$rsum <- rowSums(x = data.filtered, slot = "counts")
df$gene_name <- rownames(df)
df <- df[order(df$rsum,decreasing = TRUE),]
head(df, 20)
##             rsum gene_name
## Malat1  16890260    Malat1
## Meg3     1377819      Meg3
## Snhg11    595771    Snhg11
## Rtn1      180953      Rtn1
## Ube3a     170010     Ube3a
## Gria2     128412     Gria2
## Apoe      107107      Apoe
## Acp1       96951      Acp1
## Rsrp1      95958     Rsrp1
## Cst3       84541      Cst3
## PISD       83844      PISD
## Atp1b1     78768    Atp1b1
## mt-Atp6    76932   mt-Atp6
## mt-Co3     76634    mt-Co3
## Grin2b     76361    Grin2b
## Ntm        72273       Ntm
## Itm2b      68363     Itm2b
## Pcsk2      65350     Pcsk2
## Tecr       62821      Tecr
## Ptgds      61343     Ptgds

Next steps

For something to be informative, it needs to exhibit variation, but not all variation is informative. The goal of our clustering analysis is to keep the major sources of variation in our dataset that should define our cell types, while restricting the variation due to uninteresting sources of variation (sequencing depth, cell cycle differences, mitochondrial expression, batch effects, etc.). Then, to determine the cell types present, we will perform a clustering analysis using the most variable genes to define the major sources of variation in the dataset.

  1. Explain sources of unwanted variation: cell cycle, percent.mt (cell stress)
  2. Normalizing and regressing out sources of unwanted variation: sctransform
  3. Integration (OPTIONAL STEP): recommended when comparing groups, uses shared sources of high variation to identify shared sub-populations across conditions, we visualize after integration to ensure it’s good before clustering
    • Stuart and Butler et al. (2018)
  4. Clustering cells: cluster cells based on similarity of their gene expression profiles, cells are assigned to a cluster based on their PCA scores based on expression of integrated highly variable genes
  5. Cluster quality evaluation: see what clusters aren’t influenced by sources of uninteresting variation, check major PCs are driving clusters, explore cell identities by looking a known markers

Cell cycle

The most common biological data correction is to remove the effects of the cell cycle on the transcriptome. This data correction can be performed by a simple linear regression against a cell cycle score.

Check cell cycle phase BEFORE doing sctransform. Counts to need to be comparable between cells and each cell has a different number for nCount_RNA.

# summary of counts per cell
summary(data.filtered@meta.data$nCount_RNA)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     310     816    1115    1266    1534    8303

Use the NormalizeData() function with the argument LogNormalze to account for sequencing depth. nCount_RNA for each gene is divided by the total nCount_RNA for that cell. This is done for all cells. This number is then multiplied by the scale.factor so we don’t have to deal with a tiny number. This number is then natural-log transformed using log1p. log1p is the natural logarithm (base e) of 1 + count. The 1 will prevent taking the log of 0.

data.phase <- NormalizeData(data.filtered,
                            scale.factor = 10000, # default
                            normalization.method = "LogNormalize" # default
                            )

Give each cell a score based on expression of G1, G2/M, and S phase markers. A list of markers is provided for humans. We will need to convert human gene names to mouse gene names. We will use the CellCycleScoring() function in seurat.

Below is a resource for acquiring cell markers for other organisms https://hbctraining.github.io/scRNA-seq_online/lessons/cell_cycle_scoring.html

G1 ~10 hrs S ~5-6 hrs G2 ~3-4 hrs M ~2 hrs

G1 (10 hrs) > G2/M (5-6 hrs) = S (5-6 hrs)

If the score is negative for both S.Score and G2M.Score the phase is G1. Otherwise the the greatest positive value between S.Score and G2M.Score determines the phase.

Using a human list for mouse https://github.com/satijalab/seurat/issues/2493

# Basic function to convert human to mouse gene names
convertHumanGeneList <- function(x){
require("biomaRt")
human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
genesV2 = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", 
                 values = x , mart = human, attributesL = c("mgi_symbol"), 
                 martL = mouse, uniqueRows=T)
humanx <- unique(genesV2[, 2])
# Print the first 6 genes found to the screen
print(head(humanx))
return(humanx)
}

m.s.genes <- convertHumanGeneList(cc.genes.updated.2019$s.genes)
m.g2m.genes <- convertHumanGeneList(cc.genes.updated.2019$g2m.genes)
saveRDS(m.s.genes, "../../results/geneLists/m.s.genes.rds")
saveRDS(m.g2m.genes, "../../results/geneLists/m.g2m.genes.rds")
m.s.genes <- readRDS("../../results/geneLists/m.s.genes.rds")
m.g2m.genes <- readRDS("../../results/geneLists/m.g2m.genes.rds")

# score cells for cell cycle
data.phase <- CellCycleScoring(data.phase,
                               g2m.features = m.g2m.genes,
                               s.features = m.s.genes,
                               set.ident = TRUE)
## Warning: The following features are not present in the object: Mcm4, Rrm2,
## Uhrf1, Chaf1b, Exo1, Gmnn, Cdc45, Fen1, Cdc6, Cenpu, Dscc1, Hells, Rad51ap1,
## Tipin, Prim1, Clspn, Cdca7, Mcm6, Mcm5, Pola1, Usp1, Ung, Blm, Casp8ap2, Pcna,
## Polr1b, Rrm1, Mcm7, Rad51, Ccne2, E2f8, not searching for symbol synonyms
## Warning: The following features are not present in the object: Gtse1, Smc4,
## Dlgap5, Cdc25c, Kif20b, Cdca2, Cks2, Ttk, Cdk1, Cdca3, Ckap2, Hmgb2, Gas2l3,
## Tacc3, Cdc20, Birc5, Lbr, Anln, Cks1b, Kif2c, Hmmr, Cenpa, Kif23, Aurkb, Kif11,
## Tpx2, Ccnb2, Cdca8, Ndc80, Cenpe, Nek2, Ect2, Jpt1, Bub1, Pimreg, Nusap1,
## Ckap2l, Aurka, Ube2c, Nuf2, not searching for symbol synonyms

Bar graph

cellcyclecount_barplot <- 
  as_tibble(data.phase[[]]) %>%
  ggplot(aes(Phase, fill = Phase)) + geom_bar()
cellcyclecount_barplot

percent.phase <- data.phase@meta.data %>%
  group_by(sample, Phase) %>%
  dplyr::count() %>%
  group_by(sample) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = sample, y = percent, fill = Phase)) +
  geom_col() +
  ggtitle("Percentage of phase per sample") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
percent.phase

Pie chart

# pie point 
cellcyclecount_piepoint <- 
  as_tibble(data.phase[[]]) %>%
  ggplot(aes(x=S.Score, y=G2M.Score, color=Phase)) + 
  geom_point()
cellcyclecount_piepoint

Top variable genes

Find the top variable genes before performing PCA. The data is scaled since highly expressed genes usually are the most variable. This will make the mean expression zero and the variance of each gene across cells is one.

# Identify the most variable genes
data.phase <- FindVariableFeatures(data.phase,
                                   selection.method = "vst",  # default vst
                                   nfeatures = 2000,  # default 2000
                                   verbose = FALSE)

# view top variable genes
top40 <- head(VariableFeatures(data.phase), 40)
top40
##  [1] "Ttr"    "Ptgds"  "Vip"    "Npy"    "Sst"    "Apoe"   "Vtn"    "Apod"  
##  [9] "Myoc"   "Cldn5"  "Cd74"   "Mgp"    "Acta2"  "Ly6c1"  "Spp1"   "C1qa"  
## [17] "Tac2"   "Plp1"   "Igfbp7" "Avp"    "Cst3"   "Igfbp2" "Trf"    "Klk6"  
## [25] "Ly6a"   "Clu"    "Atp1a2" "Hexb"   "Nts"    "Rgs5"   "Crhbp"  "Crip1" 
## [33] "Myl9"   "Trh"    "Ctss"   "Cort"   "Flt1"   "Trem2"  "C1qc"   "Tac1"
# plot variable features with labels
VarFeatPlot <- VariableFeaturePlot(data.phase, cols = c("gray47","red"))
VarFeatPlotLabel <- LabelPoints(plot = VarFeatPlot, 
                    points = top40, repel = TRUE, fontface="italic", 
                    xnudge = 0, ynudge = 0, max.overlaps = 12)
VarFeatPlotLabel

Mean-variance

# The variability information can be accessed using the HVFInfo method. 
# The names of the variable features can be accessed with VariableFeatures().
variance.data <- as_tibble(HVFInfo(data.phase),rownames = "Gene")
variance.data <- variance.data %>% mutate(hypervariable=Gene %in% VariableFeatures(data.phase))

# We can plot out a graph of the variance vs mean and highlight the selected genes 
# this way, we can see whether we think we’re likely to capture what we need.
subset_data <- subset(variance.data, hypervariable == TRUE)
varGeneslog <- variance.data %>% 
  ggplot(aes(log(mean),log(variance),color=hypervariable)) + 
  geom_point() + 
  scale_color_manual(values=c("black","red")) +  geom_text_repel(
    data = subset_data, max.overlaps = 20,
    aes(
      x = log(mean),
      y = log(variance),
      label = Gene, 
      fontface="italic",),segment.alpha = 1,size = 4) +
  theme(legend.position="bottom")
varGeneslog

remove(subset_data)

PCA

See if the cell cycle is a major source of variation using PCA. We scale the data because highly expressed genes exhibit the highest amount of variation and we don’t want our ‘highly variable genes’ only to reflect high expression.

vst: First, fits a line to the relationship of log(variance) and log(mean) using local polynomial regression (loess). Then, feature values are standardized using the observed mean and expected variance (given by the fitted line). Feature variance is then calculated on the standardized values after clipping to a maximum (see clip.max parameter).

The ScaleData() function in Seurat will adjust gene expressions so that the mean expression in each cell is 0. It will also scale each gene to give a variance of 1 for each cell.

# Scale the counts
data.phase <- ScaleData(data.phase)
## Centering and scaling data matrix
data.phase@assays
## $RNA
## Assay data with 8003 features for 32983 cells
## Top 10 variable features:
##  Ttr, Ptgds, Vip, Npy, Sst, Apoe, Vtn, Apod, Myoc, Cldn5
data.phase.pca <- RunPCA(data.phase, nfeatures.print = 10)
## PC_ 1 
## Positive:  Apoe, Atp1a2, Plpp3, Gpr37l1, Slc1a3, Cd63, Gja1, Mfge8, F3, Htra1 
## Negative:  Meg3, Snhg11, Opcml, Gm12027, R3hdm1, Fam19a1, Atp2b1, 2900055J20Rik, Epha5, Chgb 
## PC_ 2 
## Positive:  Ctss, C1qa, C1qc, C1qb, Laptm5, Ctsh, Trem2, Fcrls, Hexb, Selplg 
## Negative:  Gpr37l1, Plpp3, Mfge8, Cldn10, Pla2g7, Gja1, F3, Slc7a10, Ntsr2, Slc1a3 
## PC_ 3 
## Positive:  C1ql2, Zbtb20, C1ql3, Fam19a1, Ttr, Icam5, Galnt1, Atp2b1, Gm12027, Nell2 
## Negative:  Snhg11, Slc32a1, Meg3, Cit, Slc6a1, Resp18, Npy, Kcnc1, Nxph1, Sst 
## PC_ 4 
## Positive:  Slc32a1, Synpr, C1ql2, Scg2, Ttr, Slc6a1, Spock3, Nxph1, Gria1, Pnoc 
## Negative:  R3hdm1, Ptn, Ptprd, Mef2c, Stx1a, Vtn, Ly6e, Cck, Ntm, Lmo4 
## PC_ 5 
## Positive:  Vtn, Slc6a20a, Slc6a13, Pltp, Aebp1, Itm2a, Pcolce, Slc22a8, Cxcl12, Dcn 
## Negative:  Ctss, C1qa, C1qc, Selplg, C1qb, Laptm5, Fcrls, Hexb, Csf1r, Trem2

If the plots for each phase look very similar to each other, do not regress out variation due to cell cycle. You can plot PC1 vs PC2 before and after regression to see how effective it was. G1 (10 hrs) > G2/M (5-6 hrs) = S (5-6 hrs)

# Plot the PCA colored by cell cycle phase
cycle.pca.phase <- DimPlot(data.phase.pca,
        reduction = "pca",
        group.by= "Phase",
        split.by = "Phase")
cycle.pca.phase

cycle.pca <- DimPlot(data.phase.pca,
               reduction = "pca",
               group.by = "Phase",
               shuffle = TRUE)
cycle.pca

clean up seurat objects

remove(data_sample, data.filtered, data.filtered.split, data.phase.pca, cycle.pca.phase)

SCTransform

Now, we can use the SCTransform method as a more accurate method of normalizing, estimating the variance of the raw filtered data, and identifying the most variable genes. Variation in sequencing depth (total nCount_RNA per cell) is normalized using a regularized negative binomial model. ??Variance is also adjusted based on pooling information across genes with similar abundances??

Sctransform automatically accounts for cellular sequencing depth by regressing out sequencing depth (nUMIs). However, if there are other sources of uninteresting variation identified in the data during the exploration steps we can also include these. We observed little to no effect due to cell cycle phase and so we chose not to regress this out of our data. We observed some effect of mitochondrial expression and so we choose to regress this out from the data.

Since we have several samples in our dataset, we want to keep them as separate objects and transform them as that is what is required for integration. We will first split the cells in seurat.phase object by sample.

data.split <- SplitObject(data.phase, split.by = "sample")
# check
data.split
## $WT_Cor
## An object of class Seurat 
## 8003 features across 9694 samples within 1 assay 
## Active assay: RNA (8003 features, 2000 variable features)
## 
## $Trem2_KO_Cor
## An object of class Seurat 
## 8003 features across 6158 samples within 1 assay 
## Active assay: RNA (8003 features, 2000 variable features)
## 
## $WT_5XFAD_Cor
## An object of class Seurat 
## 8003 features across 3985 samples within 1 assay 
## Active assay: RNA (8003 features, 2000 variable features)
## 
## $Trem2_KO_5XFAD_Cor
## An object of class Seurat 
## 8003 features across 2919 samples within 1 assay 
## Active assay: RNA (8003 features, 2000 variable features)
## 
## $WT_Hip
## An object of class Seurat 
## 8003 features across 2619 samples within 1 assay 
## Active assay: RNA (8003 features, 2000 variable features)
## 
## $Trem2_KO_Hip
## An object of class Seurat 
## 8003 features across 2241 samples within 1 assay 
## Active assay: RNA (8003 features, 2000 variable features)
## 
## $WT_5XFAD_Hip
## An object of class Seurat 
## 8003 features across 3339 samples within 1 assay 
## Active assay: RNA (8003 features, 2000 variable features)
## 
## $Trem2_KO_5XFAD_Hip
## An object of class Seurat 
## 8003 features across 2028 samples within 1 assay 
## Active assay: RNA (8003 features, 2000 variable features)

Now we will use a ‘for loop’ to run the SCTransform() on each sample, and regress out mitochondrial expression by specifying in the vars.to.regress argument of the SCTransform() function.

Before we run this for loop, we know that the output can generate large R objects/variables in terms of memory. If we have a large dataset, then we might need to adjust the limit for allowable object sizes within R (Default is 500 * 1024 ^ 2 = 500 Mb) using the following code:

options(future.globals.maxSize = 4000 * 1024^5)

for (i in 1:length(data.split)) {
  print(paste0("Sample ", i))
  data.split[[i]] <- SCTransform(data.split[[i]], 
                                 verbose = FALSE
                                 )
}
## [1] "Sample 1"
## [1] "Sample 2"
## [1] "Sample 3"
## [1] "Sample 4"
## [1] "Sample 5"
## [1] "Sample 6"
## [1] "Sample 7"
## [1] "Sample 8"

NOTE: By default, after normalizing, adjusting the variance, and regressing out uninteresting sources of variation, SCTransform will rank the genes by residual variance and output the 3000 most variant genes. If the dataset has larger cell numbers, then it may be beneficial to adjust this parameter higher using the variable.features.n argument.

Note, the last line of output specifies “Set default assay to SCT”. We can view the different assays that we have stored in our seurat object.

A thread about whether or not regress out batch: https://github.com/satijalab/seurat/issues/3270 It is suggested to not regress out batch, and instead use a data integration method

# Check
data.split
## $WT_Cor
## An object of class Seurat 
## 16003 features across 9694 samples within 2 assays 
## Active assay: SCT (8000 features, 3000 variable features)
##  1 other assay present: RNA
## 
## $Trem2_KO_Cor
## An object of class Seurat 
## 15998 features across 6158 samples within 2 assays 
## Active assay: SCT (7995 features, 3000 variable features)
##  1 other assay present: RNA
## 
## $WT_5XFAD_Cor
## An object of class Seurat 
## 15996 features across 3985 samples within 2 assays 
## Active assay: SCT (7993 features, 3000 variable features)
##  1 other assay present: RNA
## 
## $Trem2_KO_5XFAD_Cor
## An object of class Seurat 
## 15965 features across 2919 samples within 2 assays 
## Active assay: SCT (7962 features, 3000 variable features)
##  1 other assay present: RNA
## 
## $WT_Hip
## An object of class Seurat 
## 15952 features across 2619 samples within 2 assays 
## Active assay: SCT (7949 features, 3000 variable features)
##  1 other assay present: RNA
## 
## $Trem2_KO_Hip
## An object of class Seurat 
## 15933 features across 2241 samples within 2 assays 
## Active assay: SCT (7930 features, 3000 variable features)
##  1 other assay present: RNA
## 
## $WT_5XFAD_Hip
## An object of class Seurat 
## 15981 features across 3339 samples within 2 assays 
## Active assay: SCT (7978 features, 3000 variable features)
##  1 other assay present: RNA
## 
## $Trem2_KO_5XFAD_Hip
## An object of class Seurat 
## 15914 features across 2028 samples within 2 assays 
## Active assay: SCT (7911 features, 3000 variable features)
##  1 other assay present: RNA

Integration

Run harmony

Condition-specific clustering of cells indicates that we need to integrate the cells across conditions to ensure that cells of the same cell type cluster together.

To integrate, use the shared highly variable genes from each condition identified using SCTransform. Then, integrate conditions to overlay cells that are similar or have a “common set of biological features” between groups.

Now, using our SCTransform object as input, let’s perform the integration across conditions.

First, we need to specify that we want to use all of the 3000 most variable genes identified by SCTransform for the integration. By default, this function selects the top 2000 genes.

# Choose the features to use when integrating multiple datasets. 
# will use nfeatures as 3000 as defined by running SCTransform above
var.features <- SelectIntegrationFeatures(object.list = data.split, 
                                          nfeatures = 3000)

# merge the data
data.sct.merged <- merge(x = data.split[[1]],
  y = c(
    data.split[[2]],
    data.split[[3]],
    data.split[[4]],
    data.split[[5]],
    data.split[[6]],
    data.split[[7]],
    data.split[[8]]
  ),
  project = paste0("Colonna data ", tissue, " Single Nucleus")
)

# define the variable features 
VariableFeatures(data.sct.merged) <- var.features

save rObject

insepct data

data.sct.merged <- RunPCA(data.sct.merged, assay = "SCT")
## PC_ 1 
## Positive:  Ube3a, Snhg11, Fam19a1, Pcsk2, Grin2b, Acp1, Gria2, Rtn1, R3hdm1, Cul4a 
##     PISD, Opcml, Gm12027, Syt1, Matk, Cck, Arpp21, Atp2b1, Celf2, Grin2a 
##     2900055J20Rik, Mef2c, Epha5, Kalrn, Miat, Thy1, Rsrp1, Chgb, Ptprn, Csmd1 
## Negative:  Apoe, Cst3, Atp1a2, Clu, Plpp3, Cd81, Gpr37l1, Cd63, Ptgds, Slc1a3 
##     Mfge8, Cd9, Gja1, Apod, Slc1a2, Plp1, Htra1, Pla2g7, F3, Sparcl1 
##     Ntsr2, Cldn10, Bcan, Ptn, mt-Atp6, mt-Co3, Serpine2, S1pr1, mt-Co1, Ptprz1 
## PC_ 2 
## Positive:  Apod, Plp1, Ptgds, Trf, Cldn11, C1qa, Mag, Hexb, Sst, C1qc 
##     Ctss, C1qb, Cd9, Mal, mt-Atp6, Npy, mt-Co1, mt-Co3, Vtn, Ctsd 
##     mt-Cytb, mt-Nd1, Resp18, Mog, Slc32a1, Bsg, Fth1, mt-Co2, Lgmn, Itm2a 
## Negative:  Plpp3, Clu, Gpr37l1, Mfge8, Apoe, Slc1a3, Gja1, Slc1a2, Cldn10, Ntsr2 
##     Pla2g7, F3, Htra1, Bcan, Atp1a2, Slc7a10, Btbd17, S1pr1, Fam19a1, Cst3 
##     Mlc1, Ntm, Aqp4, Gm3764, Sparcl1, Nkain4, Lcat, Slc6a11, Ptprz1, Phkg1 
## PC_ 3 
## Positive:  Apod, Plp1, Ptgds, Fam19a1, Trf, Cldn11, Mag, Mal, Cd9, C1qa 
##     Pcsk2, Cul4a, Mog, Vtn, C1qc, Neat1, C1qb, Ctss, Baiap2, R3hdm1 
##     Enpp2, Hexb, Celf2, C1ql3, Acp1, Arpp21, Gsn, Gm12027, Kalrn, Apoe 
## Negative:  Sst, Npy, Resp18, Slc32a1, Atp1b1, Slc6a1, Synpr, Crhbp, Scg2, Snhg11 
##     Vip, Clu, Scg5, Atp6ap2, Nxph1, Pcsk1n, Kcnc1, Ly6h, Pnoc, Rbp4 
##     Serpini1, Stmn3, Rtn1, Dner, Sparcl1, Cacna2d2, Cxcl14, Podxl2, Psap, Ache 
## PC_ 4 
## Positive:  Plp1, Trf, Cd9, Mag, Cldn11, Hexb, C1qa, C1qc, Cst3, Ctss 
##     Cd81, C1qb, Mal, Mog, Selplg, Ctsd, Pllp, Aplp1, Sox10, Lgmn 
##     Scd2, Laptm5, Stmn4, Fcrls, Apoe, Gpr37, Opalin, Csf1r, Tnfaip6, P2ry12 
## Negative:  Vtn, Atp1a2, Itm2a, Igfbp7, Pltp, Igfbp2, mt-Atp6, Ifitm3, mt-Co1, mt-Co3 
##     Cxcl12, mt-Cytb, Slc6a20a, Aebp1, Id3, Slc6a13, Slc7a11, Mgp, Dcn, Slc22a8 
##     Pcolce, Slco1a4, Vim, Tm4sf1, Itih5, Hspb1, Serping1, Klf2, Id1, mt-Nd4 
## PC_ 5 
## Positive:  Plp1, Apod, Ptgds, Cldn11, Trf, Mag, Mal, Mog, Neat1, Apoe 
##     Pllp, Gsn, Sox10, Scd2, Enpp2, Ptn, Tnfaip6, Atp1a2, Opalin, Gpr37 
##     C4b, Hapln2, Stmn4, Igfbp2, Vtn, Sst, Aplp1, Aebp1, Pdgfra, Gjc2 
## Negative:  C1qa, Hexb, Cst3, Ctss, C1qc, C1qb, Selplg, Ctsd, Laptm5, Fcrls 
##     Csf1r, Fcer1g, P2ry12, Cx3cr1, Tyrobp, Tmem119, Cyba, Gpr34, Lgmn, Olfml3 
##     Vsir, Hmha1, Unc93b1, Cd68, P2ry13, Siglech, Ctsz, Sparc, Lag3, B2m

Plot PCA

pcabrain_region <- DimPlot(data.sct.merged,
        reduction = "pca",
        group.by = "brain_region")
pcabrain_region

pcabrain_region_split <- DimPlot(data.sct.merged,
        reduction = "pca",
        split.by = "brain_region",
        group.by = "brain_region")
pcabrain_region_split

pcagenotype <- DimPlot(data.sct.merged,
        reduction = "pca",
        group.by = "genotype")
pcagenotype

pcagenotype_split <- DimPlot(data.sct.merged,
        reduction = "pca",
        split.by = "genotype",
        group.by = "genotype")
pcagenotype_split

pcasample <- DimPlot(data.sct.merged,
        reduction = "pca",
        split.by = "sample",
        group.by = "sample")
pcasample

pcasample_split <- DimPlot(data.sct.merged,
        reduction = "pca",
        group.by = "sample",
        shuffle = TRUE)
pcasample_split

# clean up
remove(pcabrain_region, pcabrain_region_split, pcagenotype, pcagenotype_split, pcasample, pcasample_split)

run harmony

# run harmony to harmonize over samples 
data.integrated <- RunHarmony(object = data.sct.merged, 
                              group.by.vars = "sample", 
                              assay.use = "SCT",
                              reduction = "pca", 
                              plot_convergence = TRUE)
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony 4/10
## Harmony 5/10
## Harmony 6/10
## Harmony 7/10
## Harmony 8/10
## Harmony converged after 8 iterations
## Warning: Invalid name supplied, making object name syntactically valid. New
## object name is Seurat..ProjectDim.SCT.harmony; see ?make.names for more details
## on syntax validity

Check output

# first put the data back in place
DefaultAssay(data.integrated) <- "SCT"
Idents(data.integrated) <- "sample"
data.integrated$sample <- factor(data.integrated$sample, 
                                  levels = sample_order)
# check PCA
p1 <- DimPlot(object = data.integrated, 
              reduction = "harmony",
              group.by = "sample",
              shuffle = TRUE)
p1

p2 <- VlnPlot(object = data.integrated, 
              features = "harmony_1", 
              group.by = "brain_region", 
              pt.size = 0) 

p3 <- VlnPlot(object = data.integrated, 
              features = "harmony_1", 
              group.by = "brain_region", 
              pt.size = 0) + NoLegend()
p3

# clean up
remove(p1, p2, p3)

Top variable features

Top 20 variable features

top20 <- data.integrated@assays$SCT@var.features[1:20]
top20
##  [1] "Apoe"   "Sst"    "Vip"    "Ptgds"  "Npy"    "Apod"   "Plp1"   "Vtn"   
##  [9] "Cst3"   "Atp1a2" "Clu"    "Tac2"   "Cd9"    "Ttr"    "Hexb"   "C1qa"  
## [17] "Plpp3"  "Tac1"   "Trf"    "Igfbp2"

PCA plot

After integration, to visualize the integrated data we can use dimensionality reduction techniques, such as PCA and Uniform Manifold Approximation and Projection (UMAP). While PCA will determine all PCs, we can only plot two at a time. In contrast, UMAP will take the information from any number of top PCs to arrange the cells in this multidimensional space. It will take those distances in multidimensional space, and try to plot them in two dimensions. In this way, the distances between cells represent similarity in expression.

To generate these visualizations with the harmony output, use reduction = “harmony”

# Plot PCA
pcabrain_region <- DimPlot(data.integrated,
        reduction = "harmony",
        group.by = "brain_region")
pcabrain_region

pcabrain_regionSplit <- DimPlot(data.integrated,
        reduction = "harmony",
        split.by = "brain_region",
        group.by = "brain_region")
pcabrain_regionSplit

pcaGenotype <- DimPlot(data.integrated,
        reduction = "harmony",
        group.by = "genotype",
        shuffle = TRUE)
pcaGenotype

pcaGenotypeSplit <- DimPlot(data.integrated,
        reduction = "harmony",
        split.by = "genotype",
        group.by = "genotype",
        shuffle = TRUE)
pcaGenotypeSplit

pcaSample <- DimPlot(data.integrated,
        reduction = "harmony",
        group.by = "sample",
        shuffle = TRUE)
pcaSample

pcaSampleSplit <- DimPlot(data.integrated,
        reduction = "harmony",
        split.by = "sample",
        group.by = "sample")
pcaSampleSplit

## png 
##   2
## null device 
##           1
## pdf 
##   2
## null device 
##           1
## pdf 
##   2
## null device 
##           1

Find significant PCs

To overcome the extensive technical noise in the expression of any single gene for scRNA-seq data, Seurat assigns cells to clusters based on their PCA scores derived from the expression of the integrated most variable genes, with each PC essentially representing a “metagene” that combines information across a correlated gene set. Determining how many PCs to include in the clustering step is therefore important to ensure that we are capturing the majority of the variation, or cell types, present in our dataset.

# Printing out the most variable genes driving PCs
print(x = data.integrated[["pca"]], 
      dims = 1:10, 
      nfeatures = 10)
## PC_ 1 
## Positive:  Ube3a, Snhg11, Fam19a1, Pcsk2, Grin2b, Acp1, Gria2, Rtn1, R3hdm1, Cul4a 
## Negative:  Apoe, Cst3, Atp1a2, Clu, Plpp3, Cd81, Gpr37l1, Cd63, Ptgds, Slc1a3 
## PC_ 2 
## Positive:  Apod, Plp1, Ptgds, Trf, Cldn11, C1qa, Mag, Hexb, Sst, C1qc 
## Negative:  Plpp3, Clu, Gpr37l1, Mfge8, Apoe, Slc1a3, Gja1, Slc1a2, Cldn10, Ntsr2 
## PC_ 3 
## Positive:  Apod, Plp1, Ptgds, Fam19a1, Trf, Cldn11, Mag, Mal, Cd9, C1qa 
## Negative:  Sst, Npy, Resp18, Slc32a1, Atp1b1, Slc6a1, Synpr, Crhbp, Scg2, Snhg11 
## PC_ 4 
## Positive:  Plp1, Trf, Cd9, Mag, Cldn11, Hexb, C1qa, C1qc, Cst3, Ctss 
## Negative:  Vtn, Atp1a2, Itm2a, Igfbp7, Pltp, Igfbp2, mt-Atp6, Ifitm3, mt-Co1, mt-Co3 
## PC_ 5 
## Positive:  Plp1, Apod, Ptgds, Cldn11, Trf, Mag, Mal, Mog, Neat1, Apoe 
## Negative:  C1qa, Hexb, Cst3, Ctss, C1qc, C1qb, Selplg, Ctsd, Laptm5, Fcrls 
## PC_ 6 
## Positive:  Sst, Npy, Igfbp2, Atp1a2, Vtn, Apod, Ptgds, Slc7a11, Crhbp, Slc6a13 
## Negative:  mt-Atp6, mt-Co3, Atp2b1, mt-Co1, C1ql3, Gpm6a, Rtn1, Aplp1, mt-Nd1, Chgb 
## PC_ 7 
## Positive:  Pdgfra, Serpine2, Matn4, Cd9, C1ql1, Ptprz1, Tmem176b, Sox10, Ptn, Gpr17 
## Negative:  Plp1, Trf, Cst3, Clu, Slc1a2, Cldn11, Mag, Gja1, Sst, Plpp3 
## PC_ 8 
## Positive:  Ptgds, Atp1a2, Apod, Igfbp2, Vtn, Slc7a11, Dcn, Slc6a13, Aebp1, Rtn1 
## Negative:  Cd9, Sst, Sox10, Plp1, Npy, Matn4, mt-Co1, mt-Co3, mt-Atp6, Ptprz1 
## PC_ 9 
## Positive:  Tac1, Penk, Synpr, Zbtb20, Sst, Scg2, Car12, Gpr88, Mhrt, Npy 
## Negative:  Brinp3, Snhg11, Tshz2, Ntm, Ptprd, Cck, R3hdm1, Rsrp1, Pde4d, Ube3a 
## PC_ 10 
## Positive:  Sst, Npy, Crhbp, Rbp4, Ntm, Fam19a1, Cort, Chgb, C1ql3, Atp2b1 
## Negative:  Vip, Cxcl14, Tac2, Cnr1, Penk, Htr3a, Crh, Pnoc, Tac1, Cpne7

Quantitative approach to an elbow plot - The point where the principal components only contribute 5% of standard deviation and the principal components cumulatively contribute 90% of the standard deviation. - The point where the percent change in variation between the consecutive PCs is less than 0.1%.

First metric

# Determine percent of variation associated with each PC
stdv <- data.integrated[["pca"]]@stdev
sum.stdv <- sum(data.integrated[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100

# Calculate cumulative percents for each PC
cumulative <- cumsum(percent.stdv)

# Determine which PC exhibits cumulative percent greater than 90% and
# and % variation associated with the PC as less than 5
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co1
## [1] 43

Second metric

# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which(
  (percent.stdv[1:length(percent.stdv) - 1] - 
     percent.stdv[2:length(percent.stdv)]) > 0.1), 
  decreasing = T)[1] + 1

# last point where change of % of variation is more than 0.1%.
co2
## [1] 22

Choose the minimum of these two metrics as the PCs covering the majority of the variation in the data.

# Minimum of the two calculation
min.pc <- min(co1, co2)
min.pc
## [1] 22

Elbow plot

Use min.pc we just calculated to generate the clusters. We can plot the elbow plot again and overlay the information determined using our metrics:

# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv, 
           cumu = cumulative, 
           rank = 1:length(percent.stdv))

# Elbow plot to visualize 
  ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) + 
  geom_text() + 
  geom_vline(xintercept = 90, color = "grey") + 
  geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
  theme_bw()

UMAP

# Run UMAP
data.integrated <- RunUMAP(data.integrated,
                           dims = 1:min.pc,
                           reduction = "harmony",
                           n.components = 3) # set to 3 to use with VR
DimPlot(data.integrated,
        group.by = "brain_region",
        shuffle = TRUE)

DimPlot(data.integrated,
        group.by = "brain_region",
        split.by = "brain_region",
        shuffle = TRUE)

DimPlot(data.integrated,
        group.by = "genotype",
        shuffle = TRUE)

DimPlot(data.integrated,
        group.by = "genotype",
        split.by = "genotype",
        shuffle = TRUE)

DimPlot(data.integrated,
        group.by = "sample",
        shuffle = TRUE)

DimPlot(data.integrated,
        group.by = "sample",
        split.by = "sample",
        shuffle = TRUE) + 
        NoLegend()

FeaturePlot(data.integrated, 
            features = "Apoe")

FeaturePlot(data.integrated, 
            features = "Ttr")

Cluster the cells

Seurat uses a graph-based clustering approach, which embeds cells in a graph structure, using a K-nearest neighbor (KNN) graph (by default), with edges drawn between cells with similar gene expression patterns. Then, it attempts to partition this graph into highly interconnected ‘quasi-cliques’ or ‘communities’ [Seurat - Guided Clustering Tutorial].

We will use the FindClusters() function to perform the graph-based clustering. The resolution is an important argument that sets the “granularity” of the downstream clustering and will need to be optimized for every individual experiment. For datasets of 3,000 - 5,000 cells, the resolution set between 0.4-1.4 generally yields good clustering. Increased resolution values lead to a greater number of clusters, which is often required for larger datasets.

The FindClusters() function allows us to enter a series of resolutions and will calculate the “granularity” of the clustering. This is very helpful for testing which resolution works for moving forward without having to run the function for each resolution.

# Determine the K-nearest neighbor graph
data.unannotated <- FindNeighbors(object = data.integrated, 
                                 assay = "SCT", # set as default after SCTransform
                                 reduction = "harmony",
                                 dims = 1:min.pc)

# Determine the clusters for various resolutions
data.unannotated <- FindClusters(object = data.unannotated,
                                 algorithm = 1, # 1= Louvain
                                 resolution = seq(0.1,0.8,by=0.1))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 32983
## Number of edges: 1116677
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9660
## Number of communities: 9
## Elapsed time: 6 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 32983
## Number of edges: 1116677
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9520
## Number of communities: 14
## Elapsed time: 6 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 32983
## Number of edges: 1116677
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9419
## Number of communities: 14
## Elapsed time: 5 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 32983
## Number of edges: 1116677
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9321
## Number of communities: 15
## Elapsed time: 6 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 32983
## Number of edges: 1116677
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9225
## Number of communities: 15
## Elapsed time: 5 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 32983
## Number of edges: 1116677
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9134
## Number of communities: 17
## Elapsed time: 5 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 32983
## Number of edges: 1116677
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9063
## Number of communities: 18
## Elapsed time: 6 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 32983
## Number of edges: 1116677
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8994
## Number of communities: 20
## Elapsed time: 5 seconds

Explore resolutions

# 0.1
umap0.1 <- DimPlot(data.unannotated,
        group.by = "SCT_snn_res.0.1",
        label = TRUE)
umap0.1

# 0.2
umap0.2 <- DimPlot(data.unannotated,
        group.by = "SCT_snn_res.0.2",
        label = TRUE)
umap0.2

# 0.3
umap0.3 <- DimPlot(data.unannotated,
        group.by = "SCT_snn_res.0.3",
        label = TRUE)
umap0.3

# 0.4
umap0.4 <- DimPlot(data.unannotated,
        group.by = "SCT_snn_res.0.4",
        label = TRUE)
umap0.4

Clustering QC

Treatment, sample, phase

# brain_region
ubrain_region <- DimPlot(data.unannotated, 
        label = FALSE, 
        group.by = "SCT_snn_res.0.1",
        split.by = "brain_region") +
  NoLegend()
ubrain_region

# brain_region
ugenotype <- DimPlot(data.unannotated, 
        label = FALSE, 
        group.by = "SCT_snn_res.0.1",
        split.by = "genotype") +
  NoLegend()
ugenotype

# sample
usample <- DimPlot(data.unannotated, 
        label = FALSE,
        group.by = "SCT_snn_res.0.1",
        split.by = "sample") +
  NoLegend()
usample

# phase
uphase <- DimPlot(data.unannotated, 
        label = FALSE,
        group.by = "SCT_snn_res.0.1",
        split.by = "Phase") +
  NoLegend()
uphase

Percent of cell type markers

data.unannotated[["percent.neuron"]] <-
  PercentageFeatureSet(data.unannotated, features = c("Snap25", "Syt1", "Gad1", "Gad2"))
data.unannotated[["percent.astrocytes"]] <-
  PercentageFeatureSet(data.unannotated, features = c("Clu", "Gfap", "Aqp4"))
data.unannotated[["percent.microglia"]] <-
  PercentageFeatureSet(data.unannotated, features = c("Hexb", "C1qa"))
data.unannotated[["percent.DAM"]] <-
  PercentageFeatureSet(data.unannotated,
                       features = c("Apoe", "Cst7", "Lyz2", "Lpl", "Cd9", "Trem2"))
data.unannotated[["percent.homeostatic"]] <-
  PercentageFeatureSet(data.unannotated, features = c("P2ry12", "Cx3cr1"))
data.unannotated[["percent.oligodendrocytes"]] <-
  PercentageFeatureSet(data.unannotated, features = c("Plp1", "Mbp", "Mag"))
data.unannotated[["percent.OPCs"]] <-
  PercentageFeatureSet(data.unannotated, features = c("Olig1", "Pdgfra", "Vcan"))

FeaturePlot(data.unannotated, features = "percent.neuron")

FeaturePlot(data.unannotated, features = "percent.astrocytes")

FeaturePlot(data.unannotated, features = "percent.microglia")

FeaturePlot(data.unannotated, features = "percent.DAM")

FeaturePlot(data.unannotated, features = "percent.homeostatic")

FeaturePlot(data.unannotated, features = "percent.oligodendrocytes")

FeaturePlot(data.unannotated, features = "percent.OPCs")

## Revist QC metrics

# nCount
fnCount <- FeaturePlot(data.unannotated, features = "nCount_RNA", order = TRUE)
fnCount

# nFeature
fnFeature <- FeaturePlot(data.unannotated, features = "nFeature_RNA", order = TRUE)
fnFeature

# percent.mt
fpercent.mt <- FeaturePlot(data.unannotated, features = "percent.mt", order = TRUE)
fpercent.mt

# cell.complexity
fcell.complexity <- FeaturePlot(data.unannotated, features = "cell.complexity", order = TRUE)
fcell.complexity

fS.Score <- FeaturePlot(data.unannotated, features = "S.Score", order = TRUE)
fS.Score

fG2M.Score <- FeaturePlot(data.unannotated, features = "G2M.Score", order = TRUE)
fG2M.Score

Percent cells per cluster

data.unannotated@meta.data$seurat_clusters <- 
  data.unannotated@meta.data$SCT_snn_res.0.3

# brain_region
bbrain_region <- data.unannotated@meta.data %>%
  group_by(seurat_clusters, brain_region) %>%
  dplyr::count() %>%
  group_by(seurat_clusters) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=seurat_clusters,y=percent, fill=brain_region)) +
  geom_col() +
  ggtitle("Percentage of brain_region per cluster")
bbrain_region

# genotype
bgenotype <- data.unannotated@meta.data %>%
  group_by(seurat_clusters, genotype) %>%
  dplyr::count() %>%
  group_by(seurat_clusters) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=seurat_clusters,y=percent, fill=genotype)) +
  geom_col() +
  ggtitle("Percentage of genotype per cluster")
bgenotype

# sample
bsample <- data.unannotated@meta.data %>%
  group_by(seurat_clusters, sample) %>%
  dplyr::count() %>%
  group_by(seurat_clusters) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=seurat_clusters,y=percent, fill=sample)) +
  geom_col() +
  ggtitle("Percentage of sample per cluster")
bsample

Number cells per cluster

genotype_ncells <- FetchData(data.unannotated, 
                     vars = c("ident", "genotype")) %>%
  dplyr::count(ident, genotype) %>%
  tidyr::spread(ident, n)
write.table(genotype_ncells, 
            paste0("../../results/", pathToTestType, "nCells/",
                   tolower(tissue),
                   "_cells_per_cluster_genotype.txt"),
            quote = FALSE, sep = "\t")

sample_ncells <- FetchData(data.unannotated, 
                     vars = c("ident", "sample")) %>%
  dplyr::count(ident,sample) %>%
  tidyr::spread(ident, n)
write.table(sample_ncells, 
            paste0("../../results/", pathToTestType, "nCells/",
                   tolower(tissue),
                   "_cells_per_cluster_sample.txt"),
            quote = FALSE, sep = "\t")