Single nucleus

Processing of data from “Human and mouse single-nucleus transcriptomics reveal TREM2-dependent and TREM2-independent cellular responses in Alzheimer’s disease” Zhou et al. 2020
https://www.nature.com/articles/s41591-019-0695-9#data-availability

Set working directory

Load libraries

Set variables and thresholds

pathToRef <- "/research/labs/neurology/fryer/projects/references/mouse/refdata-gex-mm10-2020-A"
pathToTestType <- "7months/"
treatment <- "mouse"
tissue <- "Brain"
seqType <- "nucleus"
tool <- "cellranger"

# replicate groups
replicate <- "all"
replicate1 <- "replicate1"
replicate2 <- "replicate2"
replicate3 <- "replicate3"

# genotype
WT <- "WT"
Trem2_KO <- "Trem2_KO"
WT_5XFAD <- "WT_5XFAD"
Trem2_KO_5XFAD <- "Trem2_KO_5XFAD"
sample_order <- c(
      "WT_1",
      "WT_2",
      "WT_3",
      "Trem2_KO_1",
      "Trem2_KO_2",
      "Trem2_KO_3",
      "WT_5XFAD_1",
      "WT_5XFAD_2",
      "WT_5XFAD_3",
      "Trem2_KO_5XFAD_1",
      "Trem2_KO_5XFAD_2",
      "Trem2_KO_5XFAD_3"
)
remove(df)
## Warning in remove(df): object 'df' not found

QC thresholds as defined by the paper

nCount.min <- 300
nCount.max <- 9000
nFeature.min <- 300
nFeature.max <- 5600
mt.cutoff <- 5

Setup seurat object

Using CellBender filtered output.

prefix <- "../../counts/7months/"

if (tissue == "Brain" && file.exists(paste0("../../rObjects/", 
                        pathToTestType, "merged.rds"))) {
  data <- readRDS(paste0("../../rObjects/", 
                        pathToTestType, "merged.rds"))
} else if (tissue == "Brain") {
  # individual sample objects
  WT_1 <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4173504_WT_1/")))
  WT_2 <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4173505_WT_2/")))
  WT_3 <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4173506_WT_3/")))

  Trem2_KO_1 <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4173507_Trem2_KO_1/")))
  Trem2_KO_2 <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4173508_Trem2_KO_2/")))
  Trem2_KO_3 <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4173509_Trem2_KO_3/")))

  WT_5XFAD_1 <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4173510_WT_5XFAD_1/")))
  WT_5XFAD_2  <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4173511_WT_5XFAD_2/")))
  WT_5XFAD_3  <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4173512_WT_5XFAD_3/")))

  Trem2_KO_5XFAD_1 <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4173513_Trem2_KO_5XFAD_1/")))
  Trem2_KO_5XFAD_2 <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4173514_Trem2_KO_5XFAD_2/")))
  Trem2_KO_5XFAD_3 <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4173515_Trem2_KO_5XFAD_3/")))
  
  # merge objects
  data <- merge(
    x = WT_1,
    y = c(
      WT_2,
      WT_3,
      Trem2_KO_1,
      Trem2_KO_2,
      Trem2_KO_3,
      WT_5XFAD_1,
      WT_5XFAD_2,
      WT_5XFAD_3,
      Trem2_KO_5XFAD_1,
      Trem2_KO_5XFAD_2,
      Trem2_KO_5XFAD_3
    ),
    add.cell.ids = c(
      "WT_1",
      "WT_2",
      "WT_3",
      "Trem2_KO_1",
      "Trem2_KO_2",
      "Trem2_KO_3",
      "WT_5XFAD_1",
      "WT_5XFAD_2",
      "WT_5XFAD_3",
      "Trem2_KO_5XFAD_1",
      "Trem2_KO_5XFAD_2",
      "Trem2_KO_5XFAD_3"
    ),
    project = paste0("7months", replicate, " - Single Nucleus")
  )
  
  # cleanup and save
  remove(
    WT_1,
    WT_2,
    WT_3,
    Trem2_KO_1,
    Trem2_KO_2,
    Trem2_KO_3,
    WT_5XFAD_1,
    WT_5XFAD_2,
    WT_5XFAD_3,
    Trem2_KO_5XFAD_1,
    Trem2_KO_5XFAD_2,
    Trem2_KO_5XFAD_3
  )
  saveRDS(data,
          paste0("../../rObjects/", pathToTestType, "merged.rds"))
}

# preview
data
## An object of class Seurat 
## 31053 features across 90647 samples within 1 assay 
## Active assay: RNA (31053 features, 0 variable features)

nCount_RNA = total number of transcripts (UMIs) in a single cell nFeature_RNA = number of unique genes (features)

Add sample information to seurat object

# create sample column
barcodes <- colnames(data)
pattern <- "(.+)_[ACGT]+-(\\d+)"
sample <- str_match(barcodes, pattern)[, 2]
table(sample)
## sample
##       Trem2_KO_1       Trem2_KO_2       Trem2_KO_3 Trem2_KO_5XFAD_1 
##             7081             8243             8267             7002 
## Trem2_KO_5XFAD_2 Trem2_KO_5XFAD_3             WT_1             WT_2 
##             8163             6577            10122             6920 
##             WT_3       WT_5XFAD_1       WT_5XFAD_2       WT_5XFAD_3 
##             8175             7040             5018             8039
data$sample <- factor(
  sample,
  levels = c(
    "WT_1",
    "WT_2",
    "WT_3",
    "Trem2_KO_1",
    "Trem2_KO_2",
    "Trem2_KO_3",
    "WT_5XFAD_1",
    "WT_5XFAD_2",
    "WT_5XFAD_3",
    "Trem2_KO_5XFAD_1",
    "Trem2_KO_5XFAD_2",
    "Trem2_KO_5XFAD_3"
  )
)
table(data$sample)  # check
## 
##             WT_1             WT_2             WT_3       Trem2_KO_1 
##            10122             6920             8175             7081 
##       Trem2_KO_2       Trem2_KO_3       WT_5XFAD_1       WT_5XFAD_2 
##             8243             8267             7040             5018 
##       WT_5XFAD_3 Trem2_KO_5XFAD_1 Trem2_KO_5XFAD_2 Trem2_KO_5XFAD_3 
##             8039             7002             8163             6577
Idents(data) <- data$sample

Add genotype column to metadata

# create treatment column
geno <- gsub("WT_1", WT, data$sample)
geno <- gsub("WT_2", WT, geno)
geno <- gsub("WT_3", WT, geno)

geno <- gsub("Trem2_KO_1", Trem2_KO, geno)
geno <- gsub("Trem2_KO_2", Trem2_KO, geno)
geno <- gsub("Trem2_KO_3", Trem2_KO, geno)

geno <- gsub("WT_5XFAD_1", WT_5XFAD, geno)
geno <- gsub("WT_5XFAD_2", WT_5XFAD, geno)
geno <- gsub("WT_5XFAD_3", WT_5XFAD, geno)

geno <- gsub("Trem2_KO_5XFAD_1", Trem2_KO_5XFAD, geno)
geno <- gsub("Trem2_KO_5XFAD_2", Trem2_KO_5XFAD, geno)
geno <- gsub("Trem2_KO_5XFAD_3", Trem2_KO_5XFAD, geno)

data$genotype <-
  factor(geno, levels = c("WT", "Trem2_KO", "WT_5XFAD", "Trem2_KO_5XFAD"))
table(data$genotype)
## 
##             WT       Trem2_KO       WT_5XFAD Trem2_KO_5XFAD 
##          25217          23591          20097          21742

Add replicate column to metadata

# create treatment column
repl <- gsub("WT_1", replicate1, data$sample)
repl <- gsub("WT_2", replicate2, repl)
repl <- gsub("WT_3", replicate3, repl)

repl <- gsub("Trem2_KO_1", replicate1, repl)
repl <- gsub("Trem2_KO_2", replicate2, repl)
repl <- gsub("Trem2_KO_3", replicate3, repl)

repl <- gsub("WT_5XFAD_1", replicate1, repl)
repl <- gsub("WT_5XFAD_2", replicate2, repl)
repl <- gsub("WT_5XFAD_3", replicate3, repl)

repl <- gsub("Trem2_KO_5XFAD_1", replicate1, repl)
repl <- gsub("Trem2_KO_5XFAD_2", replicate2, repl)
repl <- gsub("Trem2_KO_5XFAD_3", replicate3, repl)

data$replicate <- 
  factor(repl, levels = c("replicate1", "replicate2", "replicate3"))
table(data$replicate )
## 
## replicate1 replicate2 replicate3 
##      31245      28344      31058

Obtain quality information

# cell.complexity
data$cell.complexity <-
  log10(data$nFeature_RNA) / log10(data$nCount_RNA)

# get gene names
gene.names <- rownames(data)
# percent MT
mt.genes <- gene.names[grep("^mt-", gene.names)]
mt.genes
##  [1] "mt-Nd1"  "mt-Nd2"  "mt-Co1"  "mt-Co2"  "mt-Atp8" "mt-Atp6" "mt-Co3" 
##  [8] "mt-Nd3"  "mt-Nd4l" "mt-Nd4"  "mt-Nd5"  "mt-Nd6"  "mt-Cytb"
data$percent.mt <- PercentageFeatureSet(data, pattern = "^mt-")

# ribosomal proteins begin with 'Rpl' or 'Rsl' for this annotation file
ribo.genes <- gene.names[grep("^Hb[ba]-", gene.names)]
ribo.genes
## [1] "Hbb-bt"  "Hbb-bs"  "Hbb-bh2" "Hbb-bh1" "Hbb-y"   "Hba-x"   "Hba-a1" 
## [8] "Hba-a2"
data$percent.ribo <- PercentageFeatureSet(data, pattern = "^R[sp]l")
# mitochondrial ribosomes
mt.ribo.genes <- gene.names[grep("^Mrp[sl]", gene.names)]
mt.ribo.genes
##  [1] "Mrpl15"  "Mrpl30"  "Mrps9"   "Mrpl44"  "Mrps14"  "Mrpl41"  "Mrps2"  
##  [8] "Mrps5"   "Mrps26"  "Mrps28"  "Mrpl47"  "Mrpl24"  "Mrpl9"   "Mrps21" 
## [15] "Mrpl50"  "Mrpl37"  "Mrps15"  "Mrpl20"  "Mrpl33"  "Mrpl1"   "Mrps18c"
## [22] "Mrps17"  "Mrps33"  "Mrpl35"  "Mrpl19"  "Mrpl53"  "Mrps25"  "Mrpl51" 
## [29] "Mrps35"  "Mrps12"  "Mrpl46"  "Mrps11"  "Mrpl48"  "Mrpl17"  "Mrpl23" 
## [36] "Mrpl54"  "Mrpl42"  "Mrps31"  "Mrpl34"  "Mrps16"  "Mrpl52"  "Mrpl57" 
## [43] "Mrpl4"   "Mrps22"  "Mrpl3"   "Mrps24"  "Mrpl22"  "Mrpl55"  "Mrps23" 
## [50] "Mrpl27"  "Mrpl10"  "Mrpl45"  "Mrpl58"  "Mrps7"   "Mrpl38"  "Mrpl12" 
## [57] "Mrpl32"  "Mrpl36"  "Mrps27"  "Mrps36"  "Mrps30"  "Mrpl13"  "Mrpl40" 
## [64] "Mrpl39"  "Mrps6"   "Mrpl18"  "Mrps34"  "Mrpl28"  "Mrps18b" "Mrpl14" 
## [71] "Mrps18a" "Mrpl2"   "Mrps10"  "Mrpl21"  "Mrpl11"  "Mrpl49"  "Mrpl16" 
## [78] "Mrpl43"
data$percent.mt.ribo <-
  PercentageFeatureSet(data, pattern = "^Mrp[sl]")

# hemoglobin genes
hb.genes <- gene.names[grep("^Hb[ba]-", gene.names)]
hb.genes
## [1] "Hbb-bt"  "Hbb-bs"  "Hbb-bh2" "Hbb-bh1" "Hbb-y"   "Hba-x"   "Hba-a1" 
## [8] "Hba-a2"
data$percent.hb <-
  PercentageFeatureSet(data, pattern = "^Hb[ba]-")

# percent Ttr in each cell
data$percent.Ttr <- PercentageFeatureSet(data, features = "Ttr")

# percent Apoe in each cell
data$percent.Apoe <- PercentageFeatureSet(data, features = "Apoe")

Pre-filtering QC

Number of cells

# Visualize the number of cell counts per sample
cell_count_data <- as.data.frame(table(data$sample))
colnames(cell_count_data) <- c("sample", "frequency")

ncellsRaw <-
  ggplot(cell_count_data, aes(x = sample, y = frequency, fill = sample)) +
  geom_col() +
  theme_classic() +
  geom_text(aes(label = frequency),
            position = position_dodge(width = 0.9),
            vjust = -0.25) +
  scale_y_continuous(breaks = seq(0, 12000, by = 2000),
                     limits = c(0, 12000)) +
  ggtitle("Raw: cells per sample") +
  theme(legend.position =  "none") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ncellsRaw

Density plots

QC metrics

# Visualize nCount_RNA
denCount <- ggplot(data@meta.data,
  aes(color = sample,
  x = nCount_RNA,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  xlab("nCount_RNA") +
  ylab("Density") +
  geom_vline(xintercept = nCount.min) +
  geom_vline(xintercept = nCount.max) +
  theme(legend.position = "none")

# Visualize nCount_RNA
denFeature <- ggplot(data@meta.data,
  aes(color = sample,
  x = nFeature_RNA,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  xlab("nFeature_RNA") +
  ylab("Density") +
  geom_vline(xintercept = nFeature.min) +
  geom_vline(xintercept = nFeature.max) +
  theme(legend.position = "none")

# Visualize cell complexity
# Quality cells are usually above 0.85
denCellComplexity <- ggplot(data@meta.data,
  aes(color = sample,
  x = cell.complexity,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  xlab("Cell Complexity (log10(nFeature/nCount))") +
  ylab("Density") +
  theme(legend.position = "none")

# Arrange graphs in grid
plots <- list(denCount, denFeature, denCellComplexity)
layout <- rbind(c(1), c(2), c(3))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

# Visualize percent.mt
denMt <- ggplot(data@meta.data,
  aes(color = sample,
  x = percent.mt,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  geom_vline(xintercept = mt.cutoff) +
  xlab("% Mitochondrial Genes") +
  ylab("Density") +
  theme(legend.position = "none")

# Visualize percent.ribo
denRb <- ggplot(data@meta.data,
  aes(color = sample,
  x = percent.ribo,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  xlab("% Ribosomal Genes") +
  ylab("Density") +
  theme(legend.position = "none")

# Visualize percent.hb
denHb <- ggplot(data@meta.data,
  aes(color = sample,
  x = percent.hb,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  xlab("% Hemoglobin Genes") +
  ylab("Density") +
  theme(legend.position = "none")

# Arrange graphs in grid
plots <- list(denMt, denRb, denHb)
layout <- rbind(c(1), c(2), c(3))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

### gene-level

# Visualize percent.Ttr
denTtr <- ggplot(data@meta.data,
  aes(color = sample,
  x = log2(percent.Ttr),
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  xlab("percent Ttr") +
  ylab("Density") +
  theme(legend.position = "none")
denTtr
## Warning: Removed 89822 rows containing non-finite values (stat_density).

# Visualize percent.Apoe
denApoe <- ggplot(data@meta.data,
  aes(color = sample,
  x = log2(percent.Apoe),
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  xlab("percent Apoe") +
  ylab("Density") +
  theme(legend.position = "none")
denApoe
## Warning: Removed 58385 rows containing non-finite values (stat_density).

Violin plots

# nFeature, nCount, and cell.complexity violins
vCellInfo <- VlnPlot(
  data,
  features = c("nFeature_RNA", "nCount_RNA", "cell.complexity"),
  ncol = 3,
  group.by = 'sample',
  pt.size = 0
)

#  nFeature_RNA
vFeature <- VlnPlot(
  data,
  features = c("nFeature_RNA"),
  group.by = 'sample',
  pt.size = 0
) + theme(legend.position = "none")
vFeature

#  nCount_RNA
vCount <- VlnPlot(
  data,
  features = c("nCount_RNA"),
  group.by = 'sample',
  pt.size = 0
) + theme(legend.position = "none")
vCount

#  cell.complexity
vCC <- VlnPlot(
  data,
  features = c("cell.complexity"),
  group.by = 'sample',
  pt.size = 0
) + theme(legend.position = "none")
vCC

# percent mt, ribo, hb
vMtRiboHb <- VlnPlot(
  data,
  features = c("percent.mt", "percent.ribo", "percent.hb"),
  ncol = 3,
  group.by = 'sample',
  pt.size = 0
)

#  percent mt
vMt <- VlnPlot(
  data,
  features = c("percent.mt"),
  group.by = 'sample',
  pt.size = 0.001
) + theme(legend.position = "none")
vMt

#  percent ribosomal
vRb <- VlnPlot(
  data,
  features = c("percent.ribo"),
  group.by = 'sample',
  pt.size = 0.001
) + theme(legend.position = "none")
vRb

#  percent hemoglobin genes
vHb <- VlnPlot(
  data,
  features = c("percent.hb"),
  group.by = 'sample',
  pt.size = 0.001
) + theme(legend.position = "none")
vHb

# Ttr
vTtr <- VlnPlot(
  data,
  features = c("percent.Ttr"),
  group.by = 'sample',
  pt.size = 0.001
) + theme(legend.position = "none")
vTtr

#  Apoe
vApoe <- VlnPlot(
  data,
  features = c("percent.Apoe"),
  group.by = 'sample',
  pt.size = 0.001
) + theme(legend.position = "none")
vApoe

Scatter plots

sMt <- ggplot(
  data@meta.data,
  aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) + 
  geom_point() + 
  stat_smooth(method=lm) +
    scale_x_log10() +       
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = nCount.min) + 
  geom_vline(xintercept = nCount.max) +
  geom_hline(yintercept = nFeature.min) + 
  geom_hline(yintercept = nFeature.max) + 
  facet_wrap(~sample, ncol = 3) +
  scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
sMt
## `geom_smooth()` using formula 'y ~ x'

sMt <- FeatureScatter(data,
               feature1 = "nCount_RNA",
               feature2 = "percent.mt",
               group.by = 'sample',
               shuffle = TRUE)
sMt

Filtering

Cell-level filtering

Filtering as defined by the manuscript

a cutoff of 300–9,000 UMI, 300–5,600 Gene was applied.

We will filter based on 6 conditions:
– nCount_RNA between 300 - 9,000 – nFeature_RNA between 300 - 5,600 – percent.mt < 5

# filter
data.filtered <- subset(
  data,
  subset = (nCount_RNA > nCount.min) &
    (nCount_RNA < nCount.max) &
    (nFeature_RNA > nFeature.min) &
    (nFeature_RNA < nFeature.max) &
    (percent.mt < mt.cutoff)
)

# print cells removed
print(paste0(dim(data)[2] - dim(data.filtered)[2], " cells removed"))
## [1] "7680 cells removed"
cell_count_data <- as.data.frame(table(data.filtered$sample))
colnames(cell_count_data) <- c("sample", "frequency")

ncellsFilter1 <-
  ggplot(cell_count_data, aes(x = sample, y = frequency, fill = sample)) +
  geom_col() +
  theme_classic() +
  geom_text(aes(label = frequency),
  position = position_dodge(width = 0.9), vjust = -0.25) +
  scale_y_continuous(breaks = seq(0, 12000, by = 2000),
  limits = c(0, 12000)) +
  ggtitle("Filtered: cells per sample") +
  theme(legend.position =  "none") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ncellsFilter1

# Arrange graphs in grid
plots <- list(ncellsRaw, ncellsFilter1)
layout <- cbind(c(1), c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

Gene-level filtering

Remove lowly expressed genes. We will keep genes that have at least 1 count in 10 cells.

# filter genes
counts <- GetAssayData(object = data.filtered, slot = "counts")
nonzero <- counts > 1  # produces logical
keep <- Matrix::rowSums(nonzero) >= 10  # sum the true/false
counts.filtered <- counts[keep,]  # keep certain genes

# overwrite data.filtered
data.filtered <- CreateSeuratObject(counts.filtered, 
                                    meta.data = data.filtered@meta.data)

# print features removed
print(paste0(dim(counts)[1] - dim(counts.filtered)[1], " features removed"))
## [1] "16034 features removed"
# which features (aka genes) were removed?
all.genes <- rownames(counts)
min.exp.genes <- rownames(counts.filtered)
genes.removed <- setdiff(all.genes, min.exp.genes)

Doublet filtering

https://github.com/chris-mcginnis-ucsf/DoubletFinder\

Heterotypic - doublets derived from transcriptionally distinct cells. DoubletFinder works best on this type of doublet.

Homotopic - Transcriptionally similar cell doublets. DoubletFinder does not work as great on this type of doublet.

pANN - proportion of artificial nearest neighbors (pANN)

BCMVN - mean-variance normalized bimodality coefficient of pANN distributions produced during pN -pK parameter sweeps. The BCMVN may be used to identify the pK parameter.

Overview of steps:
A. Prepare each sample
B. pK Identification (no ground-truth) - defines the PC neighborhood size used to compute pANN
C. Homotypic Doublet Proportion Estimate - homotypic doublets may not be a problem depending on the type of analysis you are performing. If you have some doublets of the same type and their counts are normalized, they will generally represent the profile of single cells of the same type.
D. DoubletFinder
E. Visualize where the doublets are located

save object

read in object

data.split <- readRDS(paste0("../../rObjects/", pathToTestType, 
              tolower(tissue),"_post_filter_split.rds"))
# converge data.split
data.singlets <- merge(
  x = data.split[[1]],
  y = c(
    data.split[[2]],
    data.split[[3]],
    data.split[[4]],
    data.split[[5]],
    data.split[[6]],
    data.split[[7]],
    data.split[[8]],
    data.split[[9]],
    data.split[[10]],
    data.split[[11]],
    data.split[[12]]
  ),
  project = paste0("Colonna data ", tissue, " Single Nucleus")
)

# print how many cells removed
print(paste0(dim(data.filtered)[2] - dim(data.singlets)[2], " cells removed"))
## [1] "2379 cells removed"
# overwrite data.filtered
data.filtered <- data.singlets

# reset levels
data.filtered$genotype <- factor(data.filtered$genotype,
                                 levels = c("WT", "Trem2_KO", "WT_5XFAD", "Trem2_KO_5XFAD"))

data.filtered$replicate <- factor(data.filtered$replicate,
                                  levels = c("replicate1", "replicate2", "replicate3"))

data.filtered$sample <- factor(
  data.filtered$sample,
  levels = c(
    "WT_1",
    "WT_2",
    "WT_3",
    "Trem2_KO_1",
    "Trem2_KO_2",
    "Trem2_KO_3",
    "WT_5XFAD_1",
    "WT_5XFAD_2",
    "WT_5XFAD_3",
    "Trem2_KO_5XFAD_1",
    "Trem2_KO_5XFAD_2",
    "Trem2_KO_5XFAD_3"
  )
)

# cleanup
remove(data.singlets,
       data.split,
       data_sample_singlets,
       data_sample)
## Warning in remove(data.singlets, data.split, data_sample_singlets, data_sample):
## object 'data_sample_singlets' not found
## Warning in remove(data.singlets, data.split, data_sample_singlets, data_sample):
## object 'data_sample' not found
remove(n_cells,
       n_cells_singlets,
       ncell_matrix,
       ncell_max,
       ncells_per_cluster,
       ncells_melt)
## Warning in remove(n_cells, n_cells_singlets, ncell_matrix, ncell_max,
## ncells_per_cluster, : object 'n_cells' not found
## Warning in remove(n_cells, n_cells_singlets, ncell_matrix, ncell_max,
## ncells_per_cluster, : object 'n_cells_singlets' not found
## Warning in remove(n_cells, n_cells_singlets, ncell_matrix, ncell_max,
## ncells_per_cluster, : object 'ncell_matrix' not found
## Warning in remove(n_cells, n_cells_singlets, ncell_matrix, ncell_max,
## ncells_per_cluster, : object 'ncell_max' not found
## Warning in remove(n_cells, n_cells_singlets, ncell_matrix, ncell_max,
## ncells_per_cluster, : object 'ncells_per_cluster' not found
## Warning in remove(n_cells, n_cells_singlets, ncell_matrix, ncell_max,
## ncells_per_cluster, : object 'ncells_melt' not found
remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, sum.stdv, stdv)
## Warning in remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, :
## object 'sweep.res.list' not found
## Warning in remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, :
## object 'sweep.stats' not found
## Warning in remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, :
## object 'bcmvn' not found
## Warning in remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, :
## object 'bcmvn_max' not found
## Warning in remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, :
## object 'difference' not found
## Warning in remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, :
## object 'sum.stdv' not found
## Warning in remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, :
## object 'stdv' not found
remove(d1, d2, d3, f1, f2, b1)
## Warning in remove(d1, d2, d3, f1, f2, b1): object 'd1' not found
## Warning in remove(d1, d2, d3, f1, f2, b1): object 'd2' not found
## Warning in remove(d1, d2, d3, f1, f2, b1): object 'd3' not found
## Warning in remove(d1, d2, d3, f1, f2, b1): object 'f1' not found
## Warning in remove(d1, d2, d3, f1, f2, b1): object 'f2' not found
## Warning in remove(d1, d2, d3, f1, f2, b1): object 'b1' not found
remove(counts, counts.filtered, nonzero, co1, co2, cellmax, min.pc)
## Warning in remove(counts, counts.filtered, nonzero, co1, co2, cellmax, min.pc):
## object 'co1' not found
## Warning in remove(counts, counts.filtered, nonzero, co1, co2, cellmax, min.pc):
## object 'co2' not found
## Warning in remove(counts, counts.filtered, nonzero, co1, co2, cellmax, min.pc):
## object 'cellmax' not found
## Warning in remove(counts, counts.filtered, nonzero, co1, co2, cellmax, min.pc):
## object 'min.pc' not found

Histogram

All samples together

# User params
goi <- "Ttr"
sample_oi <- "ALL"
threshold <- 2

# Subset data
log2.threshold <- log2(threshold + 0.01)
counts.df <- FetchData(data.filtered, vars = goi)
colnames(counts.df) <- "counts"
log2.counts.df <- log2(counts.df + 0.01)

# Histogram
title <-
  paste0("Colonna Brain Cell: ",
         goi,
         " - ",
         sample_oi,
         "\nnCount_RNA > ",
         threshold)
hist1 <- ggplot(counts.df, aes(x = counts)) +
  geom_histogram(bins = 100, fill = "gray", color = "black") +
  labs(title = title, x = NULL, y = NULL) +
  xlab(paste0(goi, " nCount_RNA")) + ylab("# of cells") + 
  theme_bw() +
  geom_vline(xintercept = threshold, col = "blue") +
  annotate(
    "rect",
    xmin = -Inf,
    xmax = threshold,
    ymin = 0,
    ymax = Inf,
    alpha = 0.2,
    fill = "deepskyblue"
  ) +
  annotate(
    "rect",
    xmin = threshold,
    xmax = Inf,
    ymin = 0,
    ymax = Inf,
    alpha = 0.2,
    fill = "chocolate4"
  )

# Histogram log transformed
hist2 <- ggplot(log2.counts.df, aes(x = counts)) +
  geom_histogram(bins = 100, fill = "gray", color = "black") +
  labs(title = title, x = NULL, y = NULL) +
  xlab(paste0("Log2(", goi, " nCount_RNA)")) + ylab("# of cells") + 
  theme_bw() +
  geom_vline(xintercept = log2.threshold, col = "blue") +
  annotate(
    "rect",
    xmin = -Inf,
    xmax = log2.threshold,
    ymin = 0,
    ymax = Inf,
    alpha = 0.2,
    fill = "deepskyblue"
  ) +
  annotate(
    "rect",
    xmin = log2.threshold,
    xmax = Inf,
    ymin = 0,
    ymax = Inf,
    alpha = 0.2,
    fill = "chocolate4"
  )

# plot
plots <- list(hist1, hist2)
layout <- rbind(c(1), c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

# number removed
table(counts.df$counts > threshold)
## 
## FALSE  TRUE 
## 80382   206

Malat1 all samples

# User params
goi <- "Malat1"
threshold <- 0.1

# Subset data
log2.threshold <- log2(threshold + 0.01)
counts.df <- FetchData(data.filtered, vars = goi)
colnames(counts.df) <- "counts"
log2.counts.df <- log2(counts.df + 0.01)

# Histogram
title <-
  paste0("Colonna Brain Cell: ", goi, "\nnCount_RNA > ", threshold)
hist1 <- ggplot(counts.df, aes(x = counts)) +
  geom_histogram(bins = 100, fill = "gray", color = "black") +
  labs(title = title, x = NULL, y = NULL) +
  xlab(paste0(goi, " nCount_RNA")) + ylab("# of Samples") +
  theme_bw() +
  geom_vline(xintercept = threshold, col = "blue") +
  annotate(
    "rect",
    xmin = -Inf,
    xmax = threshold,
    ymin = 0,
    ymax = Inf,
    alpha = 0.2,
    fill = "deepskyblue"
  ) +
  annotate(
    "rect",
    xmin = threshold,
    xmax = Inf,
    ymin = 0,
    ymax = Inf,
    alpha = 0.2,
    fill = "chocolate4"
  )

# Histogram log transformed
hist2 <- ggplot(log2.counts.df, aes(x = counts)) +
  geom_histogram(bins = 100, fill = "gray", color = "black") +
  labs(title = title, x = NULL, y = NULL) +
  xlab(paste0("Log2(", goi, " nCount_RNA)")) + ylab("# of Samples") +
  theme_bw() +
  geom_vline(xintercept = log2.threshold, col = "blue") +
  annotate(
    "rect",
    xmin = -Inf,
    xmax = log2.threshold,
    ymin = 0,
    ymax = Inf,
    alpha = 0.2,
    fill = "deepskyblue"
  ) +
  annotate(
    "rect",
    xmin = log2.threshold,
    xmax = Inf,
    ymin = 0,
    ymax = Inf,
    alpha = 0.2,
    fill = "chocolate4"
  )
# plot
plots <- list(hist1, hist2)
layout <- rbind(c(1), c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

# number removed
table(counts.df$counts > threshold)
## 
##  TRUE 
## 80588

Split by sample

Ttr by sample

Number of cells

# Visualize the number of cell counts per sample
cell_count_data <- as.data.frame(table(data.filtered$sample))
colnames(cell_count_data) <- c("sample", "frequency")

ncellsFiltered <-
  ggplot(cell_count_data, aes(x = sample, y = frequency, fill = sample)) +
  geom_col() +
  theme_classic() +
  geom_text(aes(label = frequency),
            position = position_dodge(width = 0.9),
            vjust = -0.25) +
  scale_y_continuous(breaks = seq(0, 12000, by = 2000),
                     limits = c(0, 12000)) +
  ggtitle("Filtered: cells per sample") +
  theme(legend.position =  "none") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Arrange graphs in grid
plots <- list(ncellsRaw, ncellsFiltered)
layout <- cbind(c(1), c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

Density plots

# Visualize nCount_RNA
denCount_filter <- ggplot(data.filtered@meta.data,
  aes(color = sample,
  x = nCount_RNA,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  xlab("nCount_RNA") +
  ylab("Density") +
  geom_vline(xintercept = nCount.min) +
  geom_vline(xintercept = nCount.max) +
  theme(legend.position = "none")

# Visualize nCount_RNA
denFeature_filter <- ggplot(data.filtered@meta.data,
  aes(color = sample,
  x = nFeature_RNA,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  xlab("nFeature_RNA") +
  ylab("Density") +
  geom_vline(xintercept = nFeature.min) +
  geom_vline(xintercept = nFeature.max) +
  theme(legend.position = "none")

# Visualize cell complexity
# Quality cells are usually above 0.85
denCellComplexity_filter <- ggplot(data.filtered@meta.data,
  aes(color = sample,
  x = cell.complexity,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  xlab("Cell Complexity (log10(nFeature/nCount))") +
  ylab("Density") +
  theme(legend.position = "none")

# Visualize percent.mt
denMt_filter <- ggplot(data.filtered@meta.data,
  aes(color = sample,
  x = percent.mt,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  geom_vline(xintercept = mt.cutoff) +
  xlab("% Mitochondrial Genes") +
  ylab("Density") +
  theme(legend.position = "none")

# Arrange graphs in grid
plots <- list(
  denCount,
  denFeature,
  denCellComplexity,
  denCount_filter,
  denFeature_filter,
  denCellComplexity_filter
)
layout <- rbind(c(1, 4), c(2, 5), c(3, 6))
grid_den_raw_filter <-
  grid.arrange(grobs = plots, layout_matrix = layout)

# Visualize percent.mt
denMt_filter <- ggplot(data.filtered@meta.data,
  aes(color = sample,
  x = percent.mt,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  geom_vline(xintercept = mt.cutoff) +
  xlab("% Mitochondrial Genes") +
  ylab("Density") +
  theme(legend.position = "none")

# Visualize percent.ribo
denRb_filter <- ggplot(data.filtered@meta.data,
  aes(color = sample,
  x = percent.ribo,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  xlab("% Ribosomal Genes") +
  ylab("Density") +
  theme(legend.position = "none")

# Visualize percent.hb
denHb_filter <- ggplot(data.filtered@meta.data,
  aes(color = sample,
  x = percent.hb,
  fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  xlab("% Hemoglobin Genes") +
  ylab("Density") +
  theme(legend.position = "none")

# Arrange graphs in grid
plots <- list(denMt, denRb, denHb, denMt_filter, denRb_filter, denHb_filter)
layout <- rbind(c(1, 4), c(2, 5), c(3, 6))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

save

Density plots for genes of interest

# Visualize the number of counts per cell
denApoe_filter <- ggplot(data.filtered@meta.data,
       aes(color = sample,
           x = log2(percent.Apoe),
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  xlab("percent Apoe") +
  ylab("Density") +
  theme(legend.position = "none")
denApoe_filter
## Warning: Removed 52609 rows containing non-finite values (stat_density).

denTtr_filter <- ggplot(data.filtered@meta.data,
       aes(color = sample,
           x = log2(percent.Ttr),
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  xlab("percent Ttr") +
  ylab("Density") +
  theme(legend.position = "none")
denTtr_filter
## Warning: Removed 79880 rows containing non-finite values (stat_density).

# Arrange graphs in grid
plots <- list(denApoe, denTtr,  denApoe_filter, denTtr_filter)
layout <- rbind(c(1,3), c(2,4))
grid_den_raw_filter_genes <- grid.arrange(grobs = plots, layout_matrix = layout)
## Warning: Removed 58385 rows containing non-finite values (stat_density).
## Warning: Removed 89822 rows containing non-finite values (stat_density).
## Warning: Removed 52609 rows containing non-finite values (stat_density).
## Warning: Removed 79880 rows containing non-finite values (stat_density).

Violin plots

# nFeature, nCount, and cell.complexity violins
vCellInfo <- VlnPlot(data.filtered,
              features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
              ncol = 3,
              group.by = 'sample',
              pt.size = 0)
vCellInfo

#  percent Mt, ribo, hemo
vMtRiboHb <- VlnPlot(data.filtered,
              features = c("percent.mt","percent.ribo","percent.hb"),
              ncol = 3,
              group.by = 'sample',
              pt.size = 0.001)
vMtRiboHb

#  percent Ttr
vTtr <- VlnPlot(data.filtered,
              features = c("percent.Ttr"),
              group.by = 'sample',
              pt.size = 0)
vTtr

#  percent Apoe
vApoe<- VlnPlot(data.filtered,
              features = c("percent.Apoe"),
              group.by = 'sample',
              pt.size = 0)
vApoe

Scatter plots

sMt <- ggplot(
  data.filtered@meta.data,
  aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) + 
  geom_point() + 
  stat_smooth(method=lm) +
    scale_x_log10() +       
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = nCount.min) + 
  geom_vline(xintercept = nCount.max) + 
  geom_hline(yintercept = nFeature.min) + 
  geom_hline(yintercept = nFeature.max) +
  facet_wrap(~sample, ncol = 3) +
  scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
sMt
## `geom_smooth()` using formula 'y ~ x'

sMt <- FeatureScatter(data.filtered,
               feature1 = "nCount_RNA",
               feature2 = "percent.mt",
               group.by = 'sample',
               shuffle = TRUE)
sMt

Box plot

# Visualize the distribution of genes detected per cell via boxplot
b1 <- ggplot(data.filtered@meta.data,
       aes(x = sample, 
           y = log10(nFeature_RNA), 
           fill=sample)) + 
  geom_boxplot() + 
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  theme(plot.title = element_text(hjust = 0.5, face="bold")) +
  ggtitle("Unique Genes / Cell / Sample") +
  xlab("Sample") +
  theme(legend.position = "none")
b1

Top transcripts

df <- data.frame(row.names = rownames(data.filtered))
df$rsum <- rowSums(x = data.filtered, slot = "counts")
df$gene_name <- rownames(df)
df <- df[order(df$rsum,decreasing = TRUE),]
head(df, 20)
##             rsum gene_name
## Malat1  20842434    Malat1
## Meg3     5598114      Meg3
## Snhg11   3863393    Snhg11
## Gm42418   893984   Gm42418
## Syt1      729718      Syt1
## Ptprd     713232     Ptprd
## Gria2     679050     Gria2
## Celf2     561434     Celf2
## Gm26917   525218   Gm26917
## Atp1b1    462025    Atp1b1
## Atp2b1    440574    Atp2b1
## Plp1      423624      Plp1
## App       418913       App
## Rtn1      398349      Rtn1
## Arpp21    393600    Arpp21
## Snrnp70   387262   Snrnp70
## Rsrp1     383717     Rsrp1
## Ank2      382574      Ank2
## Ttc3      378591      Ttc3
## Atp6v0b   375329   Atp6v0b

Next steps

For something to be informative, it needs to exhibit variation, but not all variation is informative. The goal of our clustering analysis is to keep the major sources of variation in our dataset that should define our cell types, while restricting the variation due to uninteresting sources of variation (sequencing depth, cell cycle differences, mitochondrial expression, batch effects, etc.). Then, to determine the cell types present, we will perform a clustering analysis using the most variable genes to define the major sources of variation in the dataset.

  1. Explain sources of unwanted variation: cell cycle, percent.mt (cell stress)
  2. Normalizing and regressing out sources of unwanted variation: sctransform
  3. Integration (OPTIONAL STEP): recommended when comparing groups, uses shared sources of high variation to identify shared sub-populations across conditions, we visualize after integration to ensure it’s good before clustering
    • Stuart and Butler et al. (2018)
  4. Clustering cells: cluster cells based on similarity of their gene expression profiles, cells are assigned to a cluster based on their PCA scores based on expression of integrated highly variable genes
  5. Cluster quality evaluation: see what clusters aren’t influenced by sources of uninteresting variation, check major PCs are driving clusters, explore cell identities by looking a known markers

Cell cycle

The most common biological data correction is to remove the effects of the cell cycle on the transcriptome. This data correction can be performed by a simple linear regression against a cell cycle score.

Check cell cycle phase BEFORE doing sctransform. Counts to need to be comparable between cells and each cell has a different number for nCount_RNA.

# summary of counts per cell
summary(data.filtered@meta.data$nCount_RNA)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     496    1507    3531    3668    5371    8988

Use the NormalizeData() function with the argument LogNormalze to account for sequencing depth. nCount_RNA for each gene is divided by the total nCount_RNA for that cell. This is done for all cells. This number is then multiplied by the scale.factor so we don’t have to deal with a tiny number. This number is then natural-log transformed using log1p. log1p is the natural logarithm (base e) of 1 + count. The 1 will prevent taking the log of 0.

data.phase <- NormalizeData(data.filtered,
                            scale.factor = 10000, # default
                            normalization.method = "LogNormalize" # default
                            )

Give each cell a score based on expression of G1, G2/M, and S phase markers. A list of markers is provided for humans. We will need to convert human gene names to mouse gene names. We will use the CellCycleScoring() function in seurat.

Below is a resource for acquiring cell markers for other organisms https://hbctraining.github.io/scRNA-seq_online/lessons/cell_cycle_scoring.html

G1 ~10 hrs S ~5-6 hrs G2 ~3-4 hrs M ~2 hrs

G1 (10 hrs) > G2/M (5-6 hrs) = S (5-6 hrs)

If the score is negative for both S.Score and G2M.Score the phase is G1. Otherwise the the greatest positive value between S.Score and G2M.Score determines the phase.

Using a human list for mouse https://github.com/satijalab/seurat/issues/2493

# Basic function to convert human to mouse gene names
convertHumanGeneList <- function(x){
require("biomaRt")
human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
genesV2 = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", 
                 values = x , mart = human, attributesL = c("mgi_symbol"), 
                 martL = mouse, uniqueRows=T)
humanx <- unique(genesV2[, 2])
# Print the first 6 genes found to the screen
print(head(humanx))
return(humanx)
}

m.s.genes <- convertHumanGeneList(cc.genes.updated.2019$s.genes)
m.g2m.genes <- convertHumanGeneList(cc.genes.updated.2019$g2m.genes)
saveRDS(m.s.genes, "../../results/geneLists/m.s.genes.rds")
saveRDS(m.g2m.genes, "../../results/geneLists/m.g2m.genes.rds")
m.s.genes <- readRDS("../../results/geneLists/m.s.genes.rds")
m.g2m.genes <- readRDS("../../results/geneLists/m.g2m.genes.rds")

# score cells for cell cycle
data.phase <- CellCycleScoring(data.phase,
                               g2m.features = m.g2m.genes,
                               s.features = m.s.genes,
                               set.ident = TRUE)
## Warning: The following features are not present in the object: Chaf1b, Exo1,
## Cdc45, Cdc6, Gins2, not searching for symbol synonyms
## Warning: The following features are not present in the object: Gtse1, Cdc25c,
## Cdk1, Cdca3, Cks1b, Kif2c, Ndc80, Pimreg, Aurka, not searching for symbol
## synonyms

Bar graph

cellcyclecount_barplot <- 
  as_tibble(data.phase[[]]) %>%
  ggplot(aes(Phase, fill = Phase)) + geom_bar()
cellcyclecount_barplot

percent.phase <- data.phase@meta.data %>%
  group_by(sample, Phase) %>%
  dplyr::count() %>%
  group_by(sample) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = sample, y = percent, fill = Phase)) +
  geom_col() +
  ggtitle("Percentage of phase per sample") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
percent.phase

Pie chart

# pie point 
cellcyclecount_piepoint <- 
  as_tibble(data.phase[[]]) %>%
  ggplot(aes(x=S.Score, y=G2M.Score, color=Phase)) + 
  geom_point()
cellcyclecount_piepoint

Top variable genes

Find the top variable genes before performing PCA. The data is scaled since highly expressed genes usually are the most variable. This will make the mean expression zero and the variance of each gene across cells is one.

# Identify the most variable genes
data.phase <- FindVariableFeatures(data.phase,
                                   selection.method = "vst",  # default vst
                                   nfeatures = 2000,  # default 2000
                                   verbose = FALSE)

# view top variable genes
top40 <- head(VariableFeatures(data.phase), 40)
top40
##  [1] "Vip"           "Ptgds"         "Vtn"           "Sst"          
##  [5] "9630013A20Rik" "Npy"           "Avp"           "Adarb2"       
##  [9] "Penk"          "Myoc"          "Ly6c1"         "Flt1"         
## [13] "Slc6a20a"      "Cxcl12"        "Rgs5"          "Atp1a2"       
## [17] "Cldn5"         "Pltp"          "Gal"           "Prg4"         
## [21] "Slco1a4"       "Slc5a7"        "Cnr1"          "Ly6a"         
## [25] "Tac2"          "Gm42047"       "Ctgf"          "Nnat"         
## [29] "Slc7a11"       "Vmn1r209"      "Nr4a2"         "F13a1"        
## [33] "Ttn"           "Cd74"          "Ndnf"          "Nts"          
## [37] "Acta2"         "Crh"           "Slc1a3"        "Dcn"
# plot variable features with labels
VarFeatPlot <- VariableFeaturePlot(data.phase, cols = c("gray47","red"))
VarFeatPlotLabel <- LabelPoints(plot = VarFeatPlot, 
                    points = top40, repel = TRUE, fontface="italic", 
                    xnudge = 0, ynudge = 0, max.overlaps = 12)
VarFeatPlotLabel

Mean-variance

# The variability information can be accessed using the HVFInfo method. 
# The names of the variable features can be accessed with VariableFeatures().
variance.data <- as_tibble(HVFInfo(data.phase),rownames = "Gene")
variance.data <- variance.data %>% mutate(hypervariable=Gene %in% VariableFeatures(data.phase))

# We can plot out a graph of the variance vs mean and highlight the selected genes 
# this way, we can see whether we think we’re likely to capture what we need.
subset_data <- subset(variance.data, hypervariable == TRUE)
varGeneslog <- variance.data %>% 
  ggplot(aes(log(mean),log(variance),color=hypervariable)) + 
  geom_point() + 
  scale_color_manual(values=c("black","red")) +  geom_text_repel(
    data = subset_data, max.overlaps = 20,
    aes(
      x = log(mean),
      y = log(variance),
      label = Gene, 
      fontface="italic",),segment.alpha = 1,size = 4) +
  theme(legend.position="bottom")
varGeneslog

remove(subset_data, varGeneslog)

PCA

See if the cell cycle is a major source of variation using PCA. We scale the data because highly expressed genes exhibit the highest amount of variation and we don’t want our ‘highly variable genes’ only to reflect high expression.

vst: First, fits a line to the relationship of log(variance) and log(mean) using local polynomial regression (loess). Then, feature values are standardized using the observed mean and expected variance (given by the fitted line). Feature variance is then calculated on the standardized values after clipping to a maximum (see clip.max parameter).

The ScaleData() function in Seurat will adjust gene expressions so that the mean expression in each cell is 0. It will also scale each gene to give a variance of 1 for each cell.

# Scale the counts
data.phase <- ScaleData(data.phase)
## Centering and scaling data matrix
data.phase@assays
## $RNA
## Assay data with 15019 features for 80588 cells
## Top 10 variable features:
##  Vip, Ptgds, Vtn, Sst, 9630013A20Rik, Npy, Avp, Adarb2, Penk, Myoc
data.phase.pca <- RunPCA(data.phase, nfeatures.print = 10)
## PC_ 1 
## Positive:  Snhg11, Meg3, Csmd1, Dlgap2, Ahi1, Celf2, Trank1, Kalrn, Arpp21, Sorbs2 
## Negative:  Neat1, Plp1, Qk, Mag, Apoe, Gatm, Cldn11, Mog, Glul, Mbp 
## PC_ 2 
## Positive:  Mag, Mog, Mbp, Cldn11, Plp1, Mal, Aspa, Pex5l, Enpp2, Tmeff2 
## Negative:  Gpr37l1, Ntsr2, Atp1a2, Slc7a10, Bcan, Slco1c1, Gja1, Plpp3, S1pr1, Fgfr3 
## PC_ 3 
## Positive:  Csf1r, C1qc, Arhgap45, Cyth4, Hexb, Inpp5d, Laptm5, P2ry12, C1qa, Hpgds 
## Negative:  Mbp, Mog, Enpp2, Gjc3, Mag, Aspa, Cldn11, Plp1, Car2, Gatm 
## PC_ 4 
## Positive:  Rgs9, Rarb, Adcy5, Gprin3, Rasgrp2, Actn2, Gpr88, Gm10754, Ryr3, Mhrt 
## Negative:  Slc17a7, Car10, Miat, Slc1a2, Mical2, Gm28928, Tcf4, Stx1a, Satb2, Kcnj6 
## PC_ 5 
## Positive:  Baiap2, Arpp21, Phactr1, Kcnh4, Slit3, Ngef, Pde7b, Arhgap32, Ankrd33b, Ptprd 
## Negative:  Nxph1, Kcnip1, Erbb4, Dner, Grip1, Grik1, Ankrd55, Kcnmb2, Dlx1as, Gad1

If the plots for each phase look very similar to each other, do not regress out variation due to cell cycle. You can plot PC1 vs PC2 before and after regression to see how effective it was. G1 (10 hrs) > G2/M (5-6 hrs) = S (5-6 hrs)

# Plot the PCA colored by cell cycle phase
cycle.pca.phase <- DimPlot(data.phase.pca,
        reduction = "pca",
        group.by= "Phase",
        split.by = "Phase")
cycle.pca.phase

cycle.pca <- DimPlot(data.phase.pca,
               reduction = "pca",
               group.by = "Phase",
               shuffle = TRUE)
cycle.pca

clean up seurat objects

remove(data_sample, data.filtered, data.filtered.split, data.phase.pca, cycle.pca.phase)
## Warning in remove(data_sample, data.filtered, data.filtered.split,
## data.phase.pca, : object 'data_sample' not found
## Warning in remove(data_sample, data.filtered, data.filtered.split,
## data.phase.pca, : object 'data.filtered.split' not found

SCTransform

Now, we can use the SCTransform method as a more accurate method of normalizing, estimating the variance of the raw filtered data, and identifying the most variable genes. Variation in sequencing depth (total nCount_RNA per cell) is normalized using a regularized negative binomial model. ??Variance is also adjusted based on pooling information across genes with similar abundances??

Sctransform automatically accounts for cellular sequencing depth by regressing out sequencing depth (nUMIs). However, if there are other sources of uninteresting variation identified in the data during the exploration steps we can also include these. We observed little to no effect due to cell cycle phase and so we chose not to regress this out of our data. We observed some effect of mitochondrial expression and so we choose to regress this out from the data.

Since we have several samples in our dataset, we want to keep them as separate objects and transform them as that is what is required for integration. We will first split the cells in seurat.phase object by sample.

data.split <- SplitObject(data.phase, split.by = "sample")
# check
data.split
## $WT_1
## An object of class Seurat 
## 15019 features across 9606 samples within 1 assay 
## Active assay: RNA (15019 features, 2000 variable features)
## 
## $WT_2
## An object of class Seurat 
## 15019 features across 6298 samples within 1 assay 
## Active assay: RNA (15019 features, 2000 variable features)
## 
## $WT_3
## An object of class Seurat 
## 15019 features across 7416 samples within 1 assay 
## Active assay: RNA (15019 features, 2000 variable features)
## 
## $Trem2_KO_1
## An object of class Seurat 
## 15019 features across 6039 samples within 1 assay 
## Active assay: RNA (15019 features, 2000 variable features)
## 
## $Trem2_KO_2
## An object of class Seurat 
## 15019 features across 7740 samples within 1 assay 
## Active assay: RNA (15019 features, 2000 variable features)
## 
## $Trem2_KO_3
## An object of class Seurat 
## 15019 features across 7650 samples within 1 assay 
## Active assay: RNA (15019 features, 2000 variable features)
## 
## $WT_5XFAD_1
## An object of class Seurat 
## 15019 features across 6329 samples within 1 assay 
## Active assay: RNA (15019 features, 2000 variable features)
## 
## $WT_5XFAD_2
## An object of class Seurat 
## 15019 features across 4033 samples within 1 assay 
## Active assay: RNA (15019 features, 2000 variable features)
## 
## $WT_5XFAD_3
## An object of class Seurat 
## 15019 features across 6100 samples within 1 assay 
## Active assay: RNA (15019 features, 2000 variable features)
## 
## $Trem2_KO_5XFAD_1
## An object of class Seurat 
## 15019 features across 6062 samples within 1 assay 
## Active assay: RNA (15019 features, 2000 variable features)
## 
## $Trem2_KO_5XFAD_2
## An object of class Seurat 
## 15019 features across 7372 samples within 1 assay 
## Active assay: RNA (15019 features, 2000 variable features)
## 
## $Trem2_KO_5XFAD_3
## An object of class Seurat 
## 15019 features across 5943 samples within 1 assay 
## Active assay: RNA (15019 features, 2000 variable features)

Now we will use a ‘for loop’ to run the SCTransform() on each sample, and regress out mitochondrial expression by specifying in the vars.to.regress argument of the SCTransform() function.

Before we run this for loop, we know that the output can generate large R objects/variables in terms of memory. If we have a large dataset, then we might need to adjust the limit for allowable object sizes within R (Default is 500 * 1024 ^ 2 = 500 Mb) using the following code:

options(future.globals.maxSize = 4000 * 1024^5)

for (i in 1:length(data.split)) {
  print(paste0("Sample ", i))
  data.split[[i]] <- SCTransform(data.split[[i]], 
                                 verbose = FALSE
                                 )
}
## [1] "Sample 1"
## [1] "Sample 2"
## [1] "Sample 3"
## [1] "Sample 4"
## [1] "Sample 5"
## [1] "Sample 6"
## [1] "Sample 7"
## [1] "Sample 8"
## [1] "Sample 9"
## [1] "Sample 10"
## [1] "Sample 11"
## [1] "Sample 12"

NOTE: By default, after normalizing, adjusting the variance, and regressing out uninteresting sources of variation, SCTransform will rank the genes by residual variance and output the 3000 most variant genes. If the dataset has larger cell numbers, then it may be beneficial to adjust this parameter higher using the variable.features.n argument.

Note, the last line of output specifies “Set default assay to SCT”. We can view the different assays that we have stored in our seurat object.

A thread about whether or not regress out batch: https://github.com/satijalab/seurat/issues/3270 It is suggested to not regress out batch, and instead use a data integration method

# Check
data.split
## $WT_1
## An object of class Seurat 
## 29997 features across 9606 samples within 2 assays 
## Active assay: SCT (14978 features, 3000 variable features)
##  1 other assay present: RNA
## 
## $WT_2
## An object of class Seurat 
## 29970 features across 6298 samples within 2 assays 
## Active assay: SCT (14951 features, 3000 variable features)
##  1 other assay present: RNA
## 
## $WT_3
## An object of class Seurat 
## 29968 features across 7416 samples within 2 assays 
## Active assay: SCT (14949 features, 3000 variable features)
##  1 other assay present: RNA
## 
## $Trem2_KO_1
## An object of class Seurat 
## 29959 features across 6039 samples within 2 assays 
## Active assay: SCT (14940 features, 3000 variable features)
##  1 other assay present: RNA
## 
## $Trem2_KO_2
## An object of class Seurat 
## 29977 features across 7740 samples within 2 assays 
## Active assay: SCT (14958 features, 3000 variable features)
##  1 other assay present: RNA
## 
## $Trem2_KO_3
## An object of class Seurat 
## 29982 features across 7650 samples within 2 assays 
## Active assay: SCT (14963 features, 3000 variable features)
##  1 other assay present: RNA
## 
## $WT_5XFAD_1
## An object of class Seurat 
## 29966 features across 6329 samples within 2 assays 
## Active assay: SCT (14947 features, 3000 variable features)
##  1 other assay present: RNA
## 
## $WT_5XFAD_2
## An object of class Seurat 
## 29871 features across 4033 samples within 2 assays 
## Active assay: SCT (14852 features, 3000 variable features)
##  1 other assay present: RNA
## 
## $WT_5XFAD_3
## An object of class Seurat 
## 29990 features across 6100 samples within 2 assays 
## Active assay: SCT (14971 features, 3000 variable features)
##  1 other assay present: RNA
## 
## $Trem2_KO_5XFAD_1
## An object of class Seurat 
## 29977 features across 6062 samples within 2 assays 
## Active assay: SCT (14958 features, 3000 variable features)
##  1 other assay present: RNA
## 
## $Trem2_KO_5XFAD_2
## An object of class Seurat 
## 30016 features across 7372 samples within 2 assays 
## Active assay: SCT (14997 features, 3000 variable features)
##  1 other assay present: RNA
## 
## $Trem2_KO_5XFAD_3
## An object of class Seurat 
## 29986 features across 5943 samples within 2 assays 
## Active assay: SCT (14967 features, 3000 variable features)
##  1 other assay present: RNA

Integration

Run harmony

Condition-specific clustering of cells indicates that we need to integrate the cells across conditions to ensure that cells of the same cell type cluster together.

To integrate, use the shared highly variable genes from each condition identified using SCTransform. Then, integrate conditions to overlay cells that are similar or have a “common set of biological features” between groups.

Now, using our SCTransform object as input, let’s perform the integration across conditions.

First, we need to specify that we want to use all of the 3000 most variable genes identified by SCTransform for the integration. By default, this function selects the top 2000 genes.

# Choose the features to use when integrating multiple datasets. 
# will use nfeatures as 3000 as defined by running SCTransform above
var.features <- SelectIntegrationFeatures(object.list = data.split, 
                                          nfeatures = 3000)

# merge the data
data.sct.merged <- merge(x = data.split[[1]],
  y = c(
    data.split[[2]],
    data.split[[3]],
    data.split[[4]],
    data.split[[5]],
    data.split[[6]],
    data.split[[7]],
    data.split[[8]],
    data.split[[9]],
    data.split[[10]],
    data.split[[11]],
    data.split[[12]]
  ),
  project = paste0("Colonna data ", tissue, " Single Nucleus")
)

# define the variable features 
VariableFeatures(data.sct.merged) <- var.features

save rObject

insepct data

data.sct.merged <- RunPCA(data.sct.merged, assay = "SCT")
## PC_ 1 
## Positive:  Plp1, Mbp, Neat1, Ptgds, Mag, Cldn11, Mog, Mal, Apod, Gatm 
##     Enpp2, Aspa, Gjc3, Car2, Ugt8a, Scd2, Mobp, Pde4b, Kctd13, C030029H02Rik 
##     Hapln2, Litaf, Fa2h, Qk, Tubb4a, Sept4, Plekhh1, 1700047M11Rik, Tspan2, Edil3 
## Negative:  Celf2, Syt1, Gria2, Pde10a, Arpp21, Kcnip4, Phactr1, Kalrn, Csmd1, Ahi1 
##     Atp2b1, Mir124a-1hg, Dlgap1, Snap25, Atp1a3, Opcml, Sorbs2, R3hdm1, Lrrc7, Nrxn3 
##     Camk2a, Atp1b1, Mef2c, Epha5, Rtn1, Grin2b, AC149090.1, Grin2a, Dlgap2, Cacna2d3 
## PC_ 2 
## Positive:  Atp1a2, Slc1a3, Slc1a2, Cst3, Apoe, Plpp3, Gpr37l1, Bcan, Ptprz1, Ntsr2 
##     Clu, Sparcl1, Gja1, Mfge8, S1pr1, Slc7a10, Htra1, Luzp2, Glul, Slco1c1 
##     F3, Prex2, Sat1, Slc7a11, Pla2g7, Fgfr3, Slc4a4, Mertk, Gpc5, Acsl3 
## Negative:  Plp1, Mbp, Ptprd, Mag, Cldn11, Mog, Mal, Enpp2, Pex5l, Aspa 
##     Ptgds, Edil3, Syt1, App, Gatm, Tmeff2, Apod, Phactr1, Arpp21, Ugt8a 
##     C030029H02Rik, Mobp, Sept4, Kctd13, Aplp1, Hapln2, Slc24a2, Neat1, Kcnip4, Tubb4a 
## PC_ 3 
## Positive:  Slc1a2, Atp1a2, Slc1a3, Plpp3, Gpr37l1, Bcan, Ntsr2, Ptprz1, Sparcl1, Clu 
##     Htra1, Slc7a10, Luzp2, Gja1, Mfge8, Apoe, S1pr1, Slco1c1, Prex2, Gm3764 
##     F3, Slc4a4, Glul, Acsl3, Fgfr3, Slc7a11, Pla2g7, Gpc5, Dclk1, Prdx6 
## Negative:  Hexb, Csf1r, C1qc, P2ry12, C1qa, Laptm5, Cx3cr1, Unc93b1, Arhgap45, C1qb 
##     Cyth4, Ctss, Inpp5d, Siglech, Ly86, Hpgds, Selplg, Tgfbr1, Fcrls, Cd37 
##     Apbb1ip, Fyb, Trem2, Lair1, Ltc4s, Cst3, Ptprc, P2ry6, Slco2b1, Il10ra 
## PC_ 4 
## Positive:  Ptprd, Tcf4, Mef2c, Dlgap1, Miat, Gm28928, Car10, Syt1, Slc17a7, Nrxn1 
##     Opcml, Slc1a2, Epha5, Kcnh7, Kalrn, Etl4, Olfm1, Snap25, Kcnip4, Sparcl1 
##     Tmem145, Fam19a1, Oxr1, Gabra1, Elavl2, Ptprk, Coro6, Clstn2, BC030499, Chgb 
## Negative:  Pde10a, Rgs9, Penk, Phactr1, Gpr88, Adcy5, Rarb, Gprin3, Adora2a, Meis2 
##     Cacnb2, Dgkb, Pde7b, Ryr3, Actn2, Scn4b, Ppp1r1b, Rasgrp2, Cacna2d3, Pde1b 
##     Spock3, Gm10754, Foxp1, Atp2b1, Strn, Tac1, Mhrt, Syndig1l, Cnr1, Dach1 
## PC_ 5 
## Positive:  Pdgfra, Ptprz1, Lhfpl3, Vcan, Cspg4, Adarb2, Nxph1, Serpine2, Tnr, C1ql1 
##     Gpr17, Matn4, Sox6, Neu4, Olig1, Erbb4, Emid1, Arhgap31, Pcdh15, Sox10 
##     Gjc3, Ednrb, Cspg5, Kcnip1, A930009A15Rik, Bcan, Gad1, Epn2, Nnat, Slc6a1 
## Negative:  Slc1a2, Ptprd, Ntsr2, Glul, Kalrn, Slc1a3, R3hdm1, Arpp21, S1pr1, Gja1 
##     Rorb, Celf2, Slco1c1, Cst3, Prex2, Dclk1, Camk2a, Kcnip4, Mef2c, Gm28928 
##     Prdx6, Fgfr3, Slc4a4, Phkg1, Slc7a10, Mfge8, Sat1, A830036E02Rik, Neat1, Mbp

Plot PCA

pcareplicate <- DimPlot(data.sct.merged,
        reduction = "pca",
        group.by = "replicate")
pcareplicate

pcareplicate_split <- DimPlot(data.sct.merged,
        reduction = "pca",
        split.by = "replicate",
        group.by = "replicate")
pcareplicate_split

pcagenotype <- DimPlot(data.sct.merged,
        reduction = "pca",
        group.by = "genotype")
pcagenotype

pcagenotype_split <- DimPlot(data.sct.merged,
        reduction = "pca",
        split.by = "genotype",
        group.by = "genotype")
pcagenotype_split

pcasample <- DimPlot(data.sct.merged,
        reduction = "pca",
        split.by = "sample",
        group.by = "sample")
pcasample

pcasample_split <- DimPlot(data.sct.merged,
        reduction = "pca",
        group.by = "sample",
        shuffle = TRUE)
pcasample_split

# clean up
remove(pcareplicate, pcareplicate_split, pcagenotype, pcagenotype_split, pcasample, pcasample_split)

run harmony

# run harmony to harmonize over samples 
data.integrated <- RunHarmony(object = data.sct.merged, 
                              group.by.vars = "sample", 
                              assay.use = "SCT",
                              reduction = "pca", 
                              plot_convergence = TRUE)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4029400)
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony 4/10
## Harmony 5/10
## Harmony 6/10
## Harmony 7/10
## Harmony 8/10
## Harmony 9/10
## Harmony 10/10
## Harmony converged after 10 iterations
## Warning: Invalid name supplied, making object name syntactically valid. New
## object name is Seurat..ProjectDim.SCT.harmony; see ?make.names for more details
## on syntax validity

Check output

# first put the data back in place
DefaultAssay(data.integrated) <- "SCT"
Idents(data.integrated) <- "sample"
data.integrated$sample <- factor(data.integrated$sample, 
                                  levels = sample_order)
# check PCA
p1 <- DimPlot(object = data.integrated, 
              reduction = "harmony",
              group.by = "sample",
              shuffle = TRUE)
p1

p2 <- VlnPlot(object = data.integrated, 
              features = "harmony_1", 
              group.by = "replicate", 
              pt.size = 0) 

p3 <- VlnPlot(object = data.integrated, 
              features = "harmony_1", 
              group.by = "replicate", 
              pt.size = 0) + NoLegend()
p3

# clean up
remove(p1, p2, p3)

Top variable features

Top 20 variable features

top20 <- data.integrated@assays$SCT@var.features[1:20]
top20
##  [1] "Plp1"     "Ptgds"    "Atp1a2"   "Mbp"      "Vip"      "Adarb2"  
##  [7] "Penk"     "Slc1a3"   "Sst"      "Apoe"     "Slc1a2"   "Neat1"   
## [13] "Vtn"      "Hexb"     "Cst3"     "Ptprz1"   "Npy"      "Slc6a20a"
## [19] "Slc7a11"  "Apod"

PCA plot

After integration, to visualize the integrated data we can use dimensionality reduction techniques, such as PCA and Uniform Manifold Approximation and Projection (UMAP). While PCA will determine all PCs, we can only plot two at a time. In contrast, UMAP will take the information from any number of top PCs to arrange the cells in this multidimensional space. It will take those distances in multidimensional space, and try to plot them in two dimensions. In this way, the distances between cells represent similarity in expression.

To generate these visualizations with the harmony output, use reduction = “harmony”

# Plot PCA
pcaReplicate <- DimPlot(data.integrated,
        reduction = "harmony",
        group.by = "replicate")
pcaReplicate

pcaReplicateSplit <- DimPlot(data.integrated,
        reduction = "harmony",
        split.by = "replicate",
        group.by = "replicate")
pcaReplicateSplit

pcaGenotype <- DimPlot(data.integrated,
        reduction = "harmony",
        group.by = "genotype",
        shuffle = TRUE)
pcaGenotype

pcaGenotypeSplit <- DimPlot(data.integrated,
        reduction = "harmony",
        split.by = "genotype",
        group.by = "genotype",
        shuffle = TRUE)
pcaGenotypeSplit

pcaSample <- DimPlot(data.integrated,
        reduction = "harmony",
        group.by = "sample",
        shuffle = TRUE)
pcaSample

pcaSampleSplit <- DimPlot(data.integrated,
        reduction = "harmony",
        split.by = "sample",
        group.by = "sample")
pcaSampleSplit

Find significant PCs

To overcome the extensive technical noise in the expression of any single gene for scRNA-seq data, Seurat assigns cells to clusters based on their PCA scores derived from the expression of the integrated most variable genes, with each PC essentially representing a “metagene” that combines information across a correlated gene set. Determining how many PCs to include in the clustering step is therefore important to ensure that we are capturing the majority of the variation, or cell types, present in our dataset.

# Printing out the most variable genes driving PCs
print(x = data.integrated[["pca"]], 
      dims = 1:10, 
      nfeatures = 10)
## PC_ 1 
## Positive:  Plp1, Mbp, Neat1, Ptgds, Mag, Cldn11, Mog, Mal, Apod, Gatm 
## Negative:  Celf2, Syt1, Gria2, Pde10a, Arpp21, Kcnip4, Phactr1, Kalrn, Csmd1, Ahi1 
## PC_ 2 
## Positive:  Atp1a2, Slc1a3, Slc1a2, Cst3, Apoe, Plpp3, Gpr37l1, Bcan, Ptprz1, Ntsr2 
## Negative:  Plp1, Mbp, Ptprd, Mag, Cldn11, Mog, Mal, Enpp2, Pex5l, Aspa 
## PC_ 3 
## Positive:  Slc1a2, Atp1a2, Slc1a3, Plpp3, Gpr37l1, Bcan, Ntsr2, Ptprz1, Sparcl1, Clu 
## Negative:  Hexb, Csf1r, C1qc, P2ry12, C1qa, Laptm5, Cx3cr1, Unc93b1, Arhgap45, C1qb 
## PC_ 4 
## Positive:  Ptprd, Tcf4, Mef2c, Dlgap1, Miat, Gm28928, Car10, Syt1, Slc17a7, Nrxn1 
## Negative:  Pde10a, Rgs9, Penk, Phactr1, Gpr88, Adcy5, Rarb, Gprin3, Adora2a, Meis2 
## PC_ 5 
## Positive:  Pdgfra, Ptprz1, Lhfpl3, Vcan, Cspg4, Adarb2, Nxph1, Serpine2, Tnr, C1ql1 
## Negative:  Slc1a2, Ptprd, Ntsr2, Glul, Kalrn, Slc1a3, R3hdm1, Arpp21, S1pr1, Gja1 
## PC_ 6 
## Positive:  Pdgfra, Lhfpl3, Cspg4, Vcan, Ptprz1, Ptprd, Tnr, Arpp21, Matn4, Gpr17 
## Negative:  Erbb4, Adarb2, Dlx6os1, Gad1, Slc6a1, Atp1b1, Kcnip1, Slc32a1, Galntl6, Nxph1 
## PC_ 7 
## Positive:  Ptgds, Slc6a20a, mt-Co1, Vtn, mt-Co3, mt-Cytb, mt-Atp8, mt-Co2, mt-Nd4l, Aebp1 
## Negative:  Ptprz1, Bcan, Luzp2, Gpr37l1, Erbb4, Adarb2, Slc6a1, Nxph1, Ntsr2, Gm3764 
## PC_ 8 
## Positive:  A830036E02Rik, Mef2c, Lingo2, BC030499, Fam19a1, Kcnip4, Rorb, Ptprd, 2900055J20Rik, Cntn5 
## Negative:  Thsd7b, Zfpm2, Sdk2, Gm28928, Foxp2, mt-Co3, mt-Co1, Slc1a2, mt-Atp8, mt-Cytb 
## PC_ 9 
## Positive:  Sparcl1, mt-Co1, mt-Co3, Syt1, mt-Atp8, Chgb, mt-Co2, Pde10a, mt-Cytb, Atp2b1 
## Negative:  Nnat, C130073E24Rik, Hap1, Cpne7, AC124490.1, Camk2d, Baiap3, Ly6h, Celf4, Nrp2 
## PC_ 10 
## Positive:  Cemip, Nxph1, Coro6, Moxd1, Etl4, Sox6, Me3, Kcnc1, Kcnc2, Cntnap4 
## Negative:  Adarb2, Vip, Cnr1, mt-Co1, mt-Co3, mt-Atp8, mt-Co2, mt-Cytb, mt-Nd4l, Kit

Quantitative approach to an elbow plot - The point where the principal components only contribute 5% of standard deviation and the principal components cumulatively contribute 90% of the standard deviation. - The point where the percent change in variation between the consecutive PCs is less than 0.1%.

First metric

# Determine percent of variation associated with each PC
stdv <- data.integrated[["pca"]]@stdev
sum.stdv <- sum(data.integrated[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100

# Calculate cumulative percents for each PC
cumulative <- cumsum(percent.stdv)

# Determine which PC exhibits cumulative percent greater than 90% and
# and % variation associated with the PC as less than 5
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co1
## [1] 41

Second metric

# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which(
  (percent.stdv[1:length(percent.stdv) - 1] - 
     percent.stdv[2:length(percent.stdv)]) > 0.1), 
  decreasing = T)[1] + 1

# last point where change of % of variation is more than 0.1%.
co2
## [1] 13

Choose the minimum of these two metrics as the PCs covering the majority of the variation in the data.

# Minimum of the two calculation
min.pc <- min(co1, co2)
min.pc
## [1] 13

Elbow plot

Use min.pc we just calculated to generate the clusters. We can plot the elbow plot again and overlay the information determined using our metrics:

# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv, 
           cumu = cumulative, 
           rank = 1:length(percent.stdv))

# Elbow plot to visualize 
  ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) + 
  geom_text() + 
  geom_vline(xintercept = 90, color = "grey") + 
  geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
  theme_bw()

UMAP

# Run UMAP
data.integrated <- RunUMAP(data.integrated,
                           dims = 1:min.pc,
                           reduction = "harmony",
                           n.components = 3) # set to 3 to use with VR
DimPlot(data.integrated,
        group.by = "replicate",
        shuffle = TRUE)

DimPlot(data.integrated,
        group.by = "replicate",
        split.by = "replicate",
        shuffle = TRUE)

DimPlot(data.integrated,
        group.by = "genotype",
        shuffle = TRUE)

DimPlot(data.integrated,
        group.by = "genotype",
        split.by = "genotype",
        shuffle = TRUE)

DimPlot(data.integrated,
        group.by = "sample",
        shuffle = TRUE)

DimPlot(data.integrated,
        group.by = "sample",
        split.by = "sample",
        shuffle = TRUE) + 
        NoLegend()

FeaturePlot(data.integrated, 
            features = "Apoe")

FeaturePlot(data.integrated, 
            features = "Ttr")

Cluster the cells

Seurat uses a graph-based clustering approach, which embeds cells in a graph structure, using a K-nearest neighbor (KNN) graph (by default), with edges drawn between cells with similar gene expression patterns. Then, it attempts to partition this graph into highly interconnected ‘quasi-cliques’ or ‘communities’ [Seurat - Guided Clustering Tutorial].

We will use the FindClusters() function to perform the graph-based clustering. The resolution is an important argument that sets the “granularity” of the downstream clustering and will need to be optimized for every individual experiment. For datasets of 3,000 - 5,000 cells, the resolution set between 0.4-1.4 generally yields good clustering. Increased resolution values lead to a greater number of clusters, which is often required for larger datasets.

The FindClusters() function allows us to enter a series of resolutions and will calculate the “granularity” of the clustering. This is very helpful for testing which resolution works for moving forward without having to run the function for each resolution.

# Determine the K-nearest neighbor graph
data.unannotated <- FindNeighbors(object = data.integrated, 
                                 assay = "SCT", # set as default after SCTransform
                                 reduction = "harmony",
                                 dims = 1:min.pc)

# Determine the clusters for various resolutions
data.unannotated <- FindClusters(object = data.unannotated,
                                 algorithm = 1, # 1= Louvain
                                 resolution = seq(0.1,0.8,by=0.1))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 80588
## Number of edges: 2639799
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9781
## Number of communities: 13
## Elapsed time: 28 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 80588
## Number of edges: 2639799
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9643
## Number of communities: 16
## Elapsed time: 32 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 80588
## Number of edges: 2639799
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9546
## Number of communities: 19
## Elapsed time: 30 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 80588
## Number of edges: 2639799
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9461
## Number of communities: 23
## Elapsed time: 31 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 80588
## Number of edges: 2639799
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9387
## Number of communities: 26
## Elapsed time: 31 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 80588
## Number of edges: 2639799
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9318
## Number of communities: 25
## Elapsed time: 27 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 80588
## Number of edges: 2639799
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9258
## Number of communities: 28
## Elapsed time: 39 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 80588
## Number of edges: 2639799
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9198
## Number of communities: 29
## Elapsed time: 28 seconds

Explore resolutions

# 0.1
umap0.1 <- DimPlot(data.unannotated,
        group.by = "SCT_snn_res.0.1",
        label = TRUE)
umap0.1

# 0.2
umap0.2 <- DimPlot(data.unannotated,
        group.by = "SCT_snn_res.0.2",
        label = TRUE)
umap0.2

# 0.3
umap0.3 <- DimPlot(data.unannotated,
        group.by = "SCT_snn_res.0.3",
        label = TRUE)
umap0.3

# 0.4
umap0.4 <- DimPlot(data.unannotated,
        group.by = "SCT_snn_res.0.4",
        label = TRUE)
umap0.4

Clustering QC

Treatment, sample, phase

# replicate
ureplicate <- DimPlot(data.unannotated, 
        label = FALSE, 
        group.by = "SCT_snn_res.0.1",
        split.by = "replicate") +
  NoLegend()
ureplicate

# replicate
ugenotype <- DimPlot(data.unannotated, 
        label = FALSE, 
        group.by = "SCT_snn_res.0.1",
        split.by = "genotype") +
  NoLegend()
ugenotype

# sample
usample <- DimPlot(data.unannotated, 
        label = FALSE,
        group.by = "SCT_snn_res.0.1",
        split.by = "sample") +
  NoLegend()
usample

# phase
uphase <- DimPlot(data.unannotated, 
        label = FALSE,
        group.by = "SCT_snn_res.0.1",
        split.by = "Phase") +
  NoLegend()
uphase

Percent of cell type markers

data.unannotated[["percent.neuron"]] <-
  PercentageFeatureSet(data.unannotated, features = c("Snap25", "Syt1", "Gad1", "Gad2"))
data.unannotated[["percent.astrocytes"]] <-
  PercentageFeatureSet(data.unannotated, features = c("Clu", "Gfap", "Aqp4"))
data.unannotated[["percent.microglia"]] <-
  PercentageFeatureSet(data.unannotated, features = c("Hexb", "C1qa"))
data.unannotated[["percent.DAM"]] <-
  PercentageFeatureSet(data.unannotated,
                       features = c("Apoe", "Cst7", "Lyz2", "Lpl", "Cd9", "Trem2"))
data.unannotated[["percent.homeostatic"]] <-
  PercentageFeatureSet(data.unannotated, features = c("P2ry12", "Cx3cr1"))
data.unannotated[["percent.oligodendrocytes"]] <-
  PercentageFeatureSet(data.unannotated, features = c("Plp1", "Mbp", "Mag"))
data.unannotated[["percent.OPCs"]] <-
  PercentageFeatureSet(data.unannotated, features = c("Olig1", "Pdgfra", "Vcan"))
data.unannotated[["percent.macrophages"]] <-
  PercentageFeatureSet(data.unannotated, features = c("F13a1", "Mrc1"))
data.unannotated[["percent.endothelial"]] <-
  PercentageFeatureSet(data.unannotated, features = c("Igfbp7", "Fn1", "Sox17"))

FeaturePlot(data.unannotated, features = "percent.neuron")

FeaturePlot(data.unannotated, features = "percent.astrocytes")

FeaturePlot(data.unannotated, features = "percent.microglia")

FeaturePlot(data.unannotated, features = "percent.DAM")

FeaturePlot(data.unannotated, features = "percent.homeostatic")

FeaturePlot(data.unannotated, features = "percent.oligodendrocytes")

FeaturePlot(data.unannotated, features = "percent.OPCs")

FeaturePlot(data.unannotated, features = "percent.macrophages")

FeaturePlot(data.unannotated, features = "percent.endothelial")

## Revist QC metrics

# nCount
fnCount <- FeaturePlot(data.unannotated, features = "nCount_RNA", order = TRUE)
fnCount

# nFeature
fnFeature <- FeaturePlot(data.unannotated, features = "nFeature_RNA", order = TRUE)
fnFeature

# percent.mt
fpercent.mt <- FeaturePlot(data.unannotated, features = "percent.mt", order = TRUE)
fpercent.mt

# cell.complexity
fcell.complexity <- FeaturePlot(data.unannotated, features = "cell.complexity", order = TRUE)
fcell.complexity

fS.Score <- FeaturePlot(data.unannotated, features = "S.Score", order = TRUE)
fS.Score

fG2M.Score <- FeaturePlot(data.unannotated, features = "G2M.Score", order = TRUE)
fG2M.Score

Percent cells per cluster

data.unannotated@meta.data$seurat_clusters <- 
  data.unannotated@meta.data$SCT_snn_res.0.1

# replicate
breplicate <- data.unannotated@meta.data %>%
  group_by(seurat_clusters, replicate) %>%
  dplyr::count() %>%
  group_by(seurat_clusters) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=seurat_clusters,y=percent, fill=replicate)) +
  geom_col() +
  ggtitle("Percentage of replicate per cluster")
breplicate

# genotype
bgenotype <- data.unannotated@meta.data %>%
  group_by(seurat_clusters, genotype) %>%
  dplyr::count() %>%
  group_by(seurat_clusters) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=seurat_clusters,y=percent, fill=genotype)) +
  geom_col() +
  ggtitle("Percentage of genotype per cluster")
bgenotype

# sample
bsample <- data.unannotated@meta.data %>%
  group_by(seurat_clusters, sample) %>%
  dplyr::count() %>%
  group_by(seurat_clusters) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=seurat_clusters,y=percent, fill=sample)) +
  geom_col() +
  ggtitle("Percentage of sample per cluster")
bsample

Number cells per cluster

genotype_ncells <- FetchData(data.unannotated, 
                     vars = c("ident", "genotype")) %>%
  dplyr::count(ident, genotype) %>%
  tidyr::spread(ident, n)
write.table(genotype_ncells, 
            paste0("../../results/", pathToTestType, "nCells/",
                   tolower(tissue),
                   "_cells_per_cluster_genotype.txt"),
            quote = FALSE, sep = "\t")

sample_ncells <- FetchData(data.unannotated, 
                     vars = c("ident", "sample")) %>%
  dplyr::count(ident,sample) %>%
  tidyr::spread(ident, n)
write.table(sample_ncells, 
            paste0("../../results/", pathToTestType, "nCells/",
                   tolower(tissue),
                   "_cells_per_cluster_sample.txt"),
            quote = FALSE, sep = "\t")