Processing of data from “Human and mouse single-nucleus transcriptomics reveal TREM2-dependent and TREM2-independent cellular responses in Alzheimer’s disease” Zhou et al. 2020
https://www.nature.com/articles/s41591-019-0695-9#data-availability
pathToRef <- "/research/labs/neurology/fryer/projects/references/mouse/refdata-gex-mm10-2020-A"
pathToTestType <- "15months/"
treatment <- "mouse"
tissue <- "Brain"
seqType <- "nucleus"
tool <- "cellranger"
# brain_region groups
brain_region <- "all"
brain_region1 <- "Cor"
brain_region2 <- "Hip"
# genotype
WT <- "WT"
Trem2_KO <- "Trem2_KO"
WT_5XFAD <- "WT_5XFAD"
Trem2_KO_5XFAD <- "Trem2_KO_5XFAD"
sample_order <- c(
"WT_Cor",
"Trem2_KO_Cor",
"WT_5XFAD_Cor",
"Trem2_KO_5XFAD_Cor",
"WT_Hip",
"Trem2_KO_Hip",
"WT_5XFAD_Hip",
"Trem2_KO_5XFAD_Hip"
)
nCount.min <- 300
nCount.max <- 9000
nFeature.min <- 300
nFeature.max <- 5600
mt.cutoff <- 5
Using CellBender filtered output.
prefix <- "../../counts/15months/"
if (tissue == "Brain" && file.exists(paste0("../../rObjects/",
pathToTestType, "merged.rds"))) {
data <- readRDS(paste0("../../rObjects/",
pathToTestType, "merged.rds"))
} else if (tissue == "Brain") {
# individual sample objects
WT_Cor <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4160643_WT_Cor/")))
Trem2_KO_Cor <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4160644_Trem2_KO_Cor/")))
WT_5XFAD_Cor <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4160645_WT_5XFAD_Cor/")))
Trem2_KO_5XFAD_Cor <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4160646_Trem2_KO_5XFAD_Cor/")))
WT_Hip <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4160647_WT_Hip/")))
Trem2_KO_Hip <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4160648_Trem2_KO_Hip/")))
WT_5XFAD_Hip <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4160649_WT_5XFAD_Hip/")))
Trem2_KO_5XFAD_Hip <- CreateSeuratObject(Read10X(paste0(prefix,"GSM4160650_Trem2_KO_5XFAD_Hip/")))
# merge objects
data <- merge(
x = WT_Cor,
y = c(
Trem2_KO_Cor,
WT_5XFAD_Cor,
Trem2_KO_5XFAD_Cor,
WT_Hip,
Trem2_KO_Hip,
WT_5XFAD_Hip,
Trem2_KO_5XFAD_Hip
),
add.cell.ids = c(
"WT_Cor",
"Trem2_KO_Cor",
"WT_5XFAD_Cor",
"Trem2_KO_5XFAD_Cor",
"WT_Hip",
"Trem2_KO_Hip",
"WT_5XFAD_Hip",
"Trem2_KO_5XFAD_Hip"
),
project = paste0("15months", brain_region, " - Single Nucleus")
)
# cleanup and save
remove(
WT_Cor,
Trem2_KO_Cor,
WT_5XFAD_Cor,
Trem2_KO_5XFAD_Cor,
WT_Hip,
Trem2_KO_Hip,
WT_5XFAD_Hip,
Trem2_KO_5XFAD_Hip
)
saveRDS(data,
paste0("../../rObjects/", pathToTestType, "merged.rds"))
}
# preview
data
## An object of class Seurat
## 27998 features across 42520 samples within 1 assay
## Active assay: RNA (27998 features, 0 variable features)
nCount_RNA = total number of transcripts (UMIs) in a single cell nFeature_RNA = number of unique genes (features)
# create sample column
barcodes <- colnames(data)
pattern <- "(.+)_[ACGT]+-(\\d+)"
sample <- str_match(barcodes, pattern)[, 2]
table(sample)
## sample
## Trem2_KO_5XFAD_Cor Trem2_KO_5XFAD_Hip Trem2_KO_Cor Trem2_KO_Hip
## 6670 3215 6912 2959
## WT_5XFAD_Cor WT_5XFAD_Hip WT_Cor WT_Hip
## 4816 4010 10549 3389
data$sample <- factor(
sample,
levels = c(
"WT_Cor",
"Trem2_KO_Cor",
"WT_5XFAD_Cor",
"Trem2_KO_5XFAD_Cor",
"WT_Hip",
"Trem2_KO_Hip",
"WT_5XFAD_Hip",
"Trem2_KO_5XFAD_Hip"
)
)
table(data$sample) # check
##
## WT_Cor Trem2_KO_Cor WT_5XFAD_Cor Trem2_KO_5XFAD_Cor
## 10549 6912 4816 6670
## WT_Hip Trem2_KO_Hip WT_5XFAD_Hip Trem2_KO_5XFAD_Hip
## 3389 2959 4010 3215
Idents(data) <- data$sample
Add genotype column to metadata
# create treatment column
geno <- gsub("WT_Cor", WT, data$sample)
geno <- gsub("WT_Hip", WT, geno)
geno <- gsub("Trem2_KO_Cor", Trem2_KO, geno)
geno <- gsub("Trem2_KO_Hip", Trem2_KO, geno)
geno <- gsub("WT_5XFAD_Cor", WT_5XFAD, geno)
geno <- gsub("WT_5XFAD_Hip", WT_5XFAD, geno)
geno <- gsub("Trem2_KO_5XFAD_Cor", Trem2_KO_5XFAD, geno)
geno <- gsub("Trem2_KO_5XFAD_Hip", Trem2_KO_5XFAD, geno)
data$genotype <-
factor(geno, levels = c(WT, Trem2_KO, WT_5XFAD, Trem2_KO_5XFAD))
table(data$genotype)
##
## WT Trem2_KO WT_5XFAD Trem2_KO_5XFAD
## 13938 9871 8826 9885
Add brain_region column to metadata
# create treatment column
repl <- gsub("WT_Cor", brain_region1, data$sample)
repl <- gsub("Trem2_KO_Cor", brain_region1, repl)
repl <- gsub("WT_5XFAD_Cor", brain_region1, repl)
repl <- gsub("Trem2_KO_5XFAD_Cor", brain_region1, repl)
repl <- gsub("WT_Hip", brain_region2, repl)
repl <- gsub("Trem2_KO_Hip", brain_region2, repl)
repl <- gsub("WT_5XFAD_Hip", brain_region2, repl)
repl <- gsub("Trem2_KO_5XFAD_Hip", brain_region2, repl)
data$brain_region <-
factor(repl, levels = c(brain_region1, brain_region2))
table(data$brain_region )
##
## Cor Hip
## 28947 13573
# cell.complexity
data$cell.complexity <-
log10(data$nFeature_RNA) / log10(data$nCount_RNA)
# get gene names
gene.names <- rownames(data)
# percent MT
mt.genes <- gene.names[grep("^mt-", gene.names)]
mt.genes
## [1] "mt-Nd1" "mt-Nd2" "mt-Co1" "mt-Co2" "mt-Atp8" "mt-Atp6" "mt-Co3"
## [8] "mt-Nd3" "mt-Nd4l" "mt-Nd4" "mt-Nd5" "mt-Nd6" "mt-Cytb"
data$percent.mt <- PercentageFeatureSet(data, pattern = "^mt-")
# ribosomal proteins begin with 'Rpl' or 'Rsl' for this annotation file
ribo.genes <- gene.names[grep("^Hb[ba]-", gene.names)]
ribo.genes
## [1] "Hbb-bt" "Hbb-bs" "Hbb-bh2" "Hbb-bh1" "Hbb-y" "Hba-x" "Hba-a1"
## [8] "Hba-a2"
data$percent.ribo <- PercentageFeatureSet(data, pattern = "^R[sp]l")
# mitochondrial ribosomes
mt.ribo.genes <- gene.names[grep("^Mrp[sl]", gene.names)]
mt.ribo.genes
## [1] "Mrpl15" "Mrpl30" "Mrps9" "Mrpl44" "Mrps14" "Mrpl41" "Mrps2"
## [8] "Mrps5" "Mrps26" "Mrps28" "Mrpl47" "Mrpl24" "Mrpl9" "Mrps21"
## [15] "Mrpl50" "Mrpl37" "Mrps15" "Mrpl20" "Mrpl33" "Mrpl1" "Mrps18c"
## [22] "Mrps17" "Mrps33" "Mrpl35" "Mrpl19" "Mrpl53" "Mrps25" "Mrpl51"
## [29] "Mrps35" "Mrps12" "Mrpl46" "Mrps11" "Mrpl48" "Mrpl17" "Mrpl23"
## [36] "Mrpl54" "Mrpl42" "Mrps31" "Mrpl34" "Mrps16" "Mrpl52" "Mrpl57"
## [43] "Mrpl4" "Mrps22" "Mrpl3" "Mrps24" "Mrpl22" "Mrpl55" "Mrps23"
## [50] "Mrpl27" "Mrpl10" "Mrpl45" "Mrps7" "Mrpl38" "Mrpl12" "Mrpl32"
## [57] "Mrpl36" "Mrps27" "Mrps36" "Mrps30" "Mrpl13" "Mrpl40" "Mrpl39"
## [64] "Mrps6" "Mrpl18" "Mrps34" "Mrpl28" "Mrps18b" "Mrpl14" "Mrps18a"
## [71] "Mrpl2" "Mrps10" "Mrpl21" "Mrpl11" "Mrpl49" "Mrpl16" "Mrpl43"
data$percent.mt.ribo <-
PercentageFeatureSet(data, pattern = "^Mrp[sl]")
# hemoglobin genes
hb.genes <- gene.names[grep("^Hb[ba]-", gene.names)]
hb.genes
## [1] "Hbb-bt" "Hbb-bs" "Hbb-bh2" "Hbb-bh1" "Hbb-y" "Hba-x" "Hba-a1"
## [8] "Hba-a2"
data$percent.hb <-
PercentageFeatureSet(data, pattern = "^Hb[ba]-")
# percent Ttr in each cell
data$percent.Ttr <- PercentageFeatureSet(data, features = "Ttr")
# percent Apoe in each cell
data$percent.Apoe <- PercentageFeatureSet(data, features = "Apoe")
# Visualize the number of cell counts per sample
cell_count_data <- as.data.frame(table(data$sample))
colnames(cell_count_data) <- c("sample", "frequency")
ncellsRaw <-
ggplot(cell_count_data, aes(x = sample, y = frequency, fill = sample)) +
geom_col() +
theme_classic() +
geom_text(aes(label = frequency),
position = position_dodge(width = 0.9),
vjust = -0.25) +
scale_y_continuous(breaks = seq(0, 12000, by = 2000),
limits = c(0, 12000)) +
ggtitle("Raw: cells per sample") +
theme(legend.position = "none") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ncellsRaw
# Visualize nCount_RNA
denCount <- ggplot(data@meta.data,
aes(color = sample,
x = nCount_RNA,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
xlab("nCount_RNA") +
ylab("Density") +
geom_vline(xintercept = nCount.min) +
geom_vline(xintercept = nCount.max) +
theme(legend.position = "none")
# Visualize nCount_RNA
denFeature <- ggplot(data@meta.data,
aes(color = sample,
x = nFeature_RNA,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
xlab("nFeature_RNA") +
ylab("Density") +
geom_vline(xintercept = nFeature.min) +
geom_vline(xintercept = nFeature.max) +
theme(legend.position = "none")
# Visualize cell complexity
# Quality cells are usually above 0.85
denCellComplexity <- ggplot(data@meta.data,
aes(color = sample,
x = cell.complexity,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
xlab("Cell Complexity (log10(nFeature/nCount))") +
ylab("Density") +
theme(legend.position = "none")
# Arrange graphs in grid
plots <- list(denCount, denFeature, denCellComplexity)
layout <- rbind(c(1), c(2), c(3))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
# Visualize percent.mt
denMt <- ggplot(data@meta.data,
aes(color = sample,
x = percent.mt,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_continuous(n.breaks = 4) +
geom_vline(xintercept = mt.cutoff) +
xlab("% Mitochondrial Genes") +
ylab("Density") +
theme(legend.position = "none")
# Visualize percent.ribo
denRb <- ggplot(data@meta.data,
aes(color = sample,
x = percent.ribo,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_continuous(n.breaks = 4) +
xlab("% Ribosomal Genes") +
ylab("Density") +
theme(legend.position = "none")
# Visualize percent.hb
denHb <- ggplot(data@meta.data,
aes(color = sample,
x = percent.hb,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_continuous(n.breaks = 4) +
xlab("% Hemoglobin Genes") +
ylab("Density") +
theme(legend.position = "none")
# Arrange graphs in grid
plots <- list(denMt, denRb, denHb)
layout <- rbind(c(1), c(2), c(3))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
### gene-level
# Visualize percent.Ttr
denTtr <- ggplot(data@meta.data,
aes(color = sample,
x = log2(percent.Ttr),
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_continuous(n.breaks = 4) +
xlab("percent Ttr") +
ylab("Density") +
theme(legend.position = "none")
denTtr
## Warning: Removed 30402 rows containing non-finite values (stat_density).
# Visualize percent.Apoe
denApoe <- ggplot(data@meta.data,
aes(color = sample,
x = log2(percent.Apoe),
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_continuous(n.breaks = 4) +
xlab("percent Apoe") +
ylab("Density") +
theme(legend.position = "none")
denApoe
## Warning: Removed 17557 rows containing non-finite values (stat_density).
# nFeature, nCount, and cell.complexity violins
vCellInfo <- VlnPlot(
data,
features = c("nFeature_RNA", "nCount_RNA", "cell.complexity"),
ncol = 3,
group.by = 'sample',
pt.size = 0
)
# nFeature_RNA
vFeature <- VlnPlot(
data,
features = c("nFeature_RNA"),
group.by = 'sample',
pt.size = 0
) + theme(legend.position = "none")
vFeature
# nCount_RNA
vCount <- VlnPlot(
data,
features = c("nCount_RNA"),
group.by = 'sample',
pt.size = 0
) + theme(legend.position = "none")
vCount
# cell.complexity
vCC <- VlnPlot(
data,
features = c("cell.complexity"),
group.by = 'sample',
pt.size = 0
) + theme(legend.position = "none")
vCC
# percent mt, ribo, hb
vMtRiboHb <- VlnPlot(
data,
features = c("percent.mt", "percent.ribo", "percent.hb"),
ncol = 3,
group.by = 'sample',
pt.size = 0
)
# percent mt
vMt <- VlnPlot(
data,
features = c("percent.mt"),
group.by = 'sample',
pt.size = 0.001
) + theme(legend.position = "none")
vMt
# percent ribosomal
vRb <- VlnPlot(
data,
features = c("percent.ribo"),
group.by = 'sample',
pt.size = 0.001
) + theme(legend.position = "none")
vRb
# percent hemoglobin genes
vHb <- VlnPlot(
data,
features = c("percent.hb"),
group.by = 'sample',
pt.size = 0.001
) + theme(legend.position = "none")
vHb
# Ttr
vTtr <- VlnPlot(
data,
features = c("percent.Ttr"),
group.by = 'sample',
pt.size = 0.001
) + theme(legend.position = "none")
vTtr
# Apoe
vApoe <- VlnPlot(
data,
features = c("percent.Apoe"),
group.by = 'sample',
pt.size = 0.001
) + theme(legend.position = "none")
vApoe
sMt <- ggplot(
data@meta.data,
aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) +
geom_point() +
stat_smooth(method=lm) +
scale_x_log10() +
scale_y_log10() +
theme_classic() +
geom_vline(xintercept = nCount.min) +
geom_vline(xintercept = nCount.max) +
geom_hline(yintercept = nFeature.min) +
geom_hline(yintercept = nFeature.max) +
facet_wrap(~sample, ncol = 3) +
scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
sMt
## `geom_smooth()` using formula 'y ~ x'
sMt <- FeatureScatter(data,
feature1 = "nCount_RNA",
feature2 = "percent.mt",
group.by = 'sample',
shuffle = TRUE)
sMt
Filtering as defined by the manuscript
a cutoff of 300–9,000 UMI, 300–5,600 Gene was applied.
We will filter based on 6 conditions:
– nCount_RNA between 300 - 9,000 – nFeature_RNA between 300 - 5,600 – percent.mt < 5
# filter
data.filtered <- subset(
data,
subset = (nCount_RNA > nCount.min) &
(nCount_RNA < nCount.max) &
(nFeature_RNA > nFeature.min) &
(nFeature_RNA < nFeature.max) &
(percent.mt < mt.cutoff)
)
# print cells removed
print(paste0(dim(data)[2] - dim(data.filtered)[2], " cells removed"))
## [1] "8699 cells removed"
cell_count_data <- as.data.frame(table(data.filtered$sample))
colnames(cell_count_data) <- c("sample", "frequency")
ncellsFilter1 <-
ggplot(cell_count_data, aes(x = sample, y = frequency, fill = sample)) +
geom_col() +
theme_classic() +
geom_text(aes(label = frequency),
position = position_dodge(width = 0.9), vjust = -0.25) +
scale_y_continuous(breaks = seq(0, 12000, by = 2000),
limits = c(0, 12000)) +
ggtitle("Filtered: cells per sample") +
theme(legend.position = "none") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ncellsFilter1
# Arrange graphs in grid
plots <- list(ncellsRaw, ncellsFilter1)
layout <- cbind(c(1), c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
Remove lowly expressed genes. We will keep genes that have at least 1 count in 10 cells.
# filter genes
counts <- GetAssayData(object = data.filtered, slot = "counts")
nonzero <- counts > 1 # produces logical
keep <- Matrix::rowSums(nonzero) >= 10 # sum the true/false
counts.filtered <- counts[keep,] # keep certain genes
# overwrite data.filtered
data.filtered <- CreateSeuratObject(counts.filtered,
meta.data = data.filtered@meta.data)
# print features removed
print(paste0(dim(counts)[1] - dim(counts.filtered)[1], " features removed"))
## [1] "19995 features removed"
# which features (aka genes) were removed?
all.genes <- rownames(counts)
min.exp.genes <- rownames(counts.filtered)
genes.removed <- setdiff(all.genes, min.exp.genes)
https://github.com/chris-mcginnis-ucsf/DoubletFinder\
Heterotypic - doublets derived from transcriptionally distinct cells. DoubletFinder works best on this type of doublet.
Homotopic - Transcriptionally similar cell doublets. DoubletFinder does not work as great on this type of doublet.
pANN - proportion of artificial nearest neighbors (pANN)
BCMVN - mean-variance normalized bimodality coefficient of pANN distributions produced during pN -pK parameter sweeps. The BCMVN may be used to identify the pK parameter.
Overview of steps:
A. Prepare each sample
B. pK Identification (no ground-truth) - defines the PC neighborhood size used to compute pANN
C. Homotypic Doublet Proportion Estimate - homotypic doublets may not be a problem depending on the type of analysis you are performing. If you have some doublets of the same type and their counts are normalized, they will generally represent the profile of single cells of the same type.
D. DoubletFinder
E. Visualize where the doublets are located
save object
read in object
data.split <- readRDS(paste0("../../rObjects/", pathToTestType,
tolower(tissue),"_post_filter_split.rds"))
# converge data.split
data.singlets <- merge(
x = data.split[[1]],
y = c(
data.split[[2]],
data.split[[3]],
data.split[[4]],
data.split[[5]],
data.split[[6]],
data.split[[7]],
data.split[[8]]
),
project = paste0("Colonna data ", tissue, " Single Nucleus")
)
# print how many cells removed
print(paste0(dim(data.filtered)[2] - dim(data.singlets)[2], " cells removed"))
## [1] "838 cells removed"
# overwrite data.filtered
data.filtered <- data.singlets
# reset levels
data.filtered$genotype <- factor(data.filtered$genotype,
levels = c("WT", "Trem2_KO", "WT_5XFAD", "Trem2_KO_5XFAD"))
data.filtered$brain_region <- factor(data.filtered$brain_region,
levels = c(brain_region1, brain_region2))
data.filtered$sample <- factor(
data.filtered$sample,
levels = sample_order
)
# cleanup
remove(data.singlets,
data.split,
data_sample_singlets,
data_sample)
## Warning in remove(data.singlets, data.split, data_sample_singlets, data_sample):
## object 'data_sample_singlets' not found
## Warning in remove(data.singlets, data.split, data_sample_singlets, data_sample):
## object 'data_sample' not found
remove(n_cells,
n_cells_singlets,
ncell_matrix,
ncell_max,
ncells_per_cluster,
ncells_melt)
## Warning in remove(n_cells, n_cells_singlets, ncell_matrix, ncell_max,
## ncells_per_cluster, : object 'n_cells' not found
## Warning in remove(n_cells, n_cells_singlets, ncell_matrix, ncell_max,
## ncells_per_cluster, : object 'n_cells_singlets' not found
## Warning in remove(n_cells, n_cells_singlets, ncell_matrix, ncell_max,
## ncells_per_cluster, : object 'ncell_matrix' not found
## Warning in remove(n_cells, n_cells_singlets, ncell_matrix, ncell_max,
## ncells_per_cluster, : object 'ncell_max' not found
## Warning in remove(n_cells, n_cells_singlets, ncell_matrix, ncell_max,
## ncells_per_cluster, : object 'ncells_per_cluster' not found
## Warning in remove(n_cells, n_cells_singlets, ncell_matrix, ncell_max,
## ncells_per_cluster, : object 'ncells_melt' not found
remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, sum.stdv, stdv)
## Warning in remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, :
## object 'sweep.res.list' not found
## Warning in remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, :
## object 'sweep.stats' not found
## Warning in remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, :
## object 'bcmvn' not found
## Warning in remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, :
## object 'bcmvn_max' not found
## Warning in remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, :
## object 'difference' not found
## Warning in remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, :
## object 'sum.stdv' not found
## Warning in remove(sweep.res.list, sweep.stats, bcmvn, bcmvn_max, difference, :
## object 'stdv' not found
remove(d1, d2, d3, f1, f2, b1)
## Warning in remove(d1, d2, d3, f1, f2, b1): object 'd1' not found
## Warning in remove(d1, d2, d3, f1, f2, b1): object 'd2' not found
## Warning in remove(d1, d2, d3, f1, f2, b1): object 'd3' not found
## Warning in remove(d1, d2, d3, f1, f2, b1): object 'f1' not found
## Warning in remove(d1, d2, d3, f1, f2, b1): object 'f2' not found
## Warning in remove(d1, d2, d3, f1, f2, b1): object 'b1' not found
remove(counts, counts.filtered, nonzero, co1, co2, cellmax, min.pc)
## Warning in remove(counts, counts.filtered, nonzero, co1, co2, cellmax, min.pc):
## object 'co1' not found
## Warning in remove(counts, counts.filtered, nonzero, co1, co2, cellmax, min.pc):
## object 'co2' not found
## Warning in remove(counts, counts.filtered, nonzero, co1, co2, cellmax, min.pc):
## object 'cellmax' not found
## Warning in remove(counts, counts.filtered, nonzero, co1, co2, cellmax, min.pc):
## object 'min.pc' not found
# User params
goi <- "Ttr"
sample_oi <- "ALL"
threshold <- 2
# Subset data
log2.threshold <- log2(threshold + 0.01)
counts.df <- FetchData(data.filtered, vars = goi)
colnames(counts.df) <- "counts"
log2.counts.df <- log2(counts.df + 0.01)
# Histogram
title <-
paste0("Colonna Brain Cell: ",
goi,
" - ",
sample_oi,
"\nnCount_RNA > ",
threshold)
hist1 <- ggplot(counts.df, aes(x = counts)) +
geom_histogram(bins = 100, fill = "gray", color = "black") +
labs(title = title, x = NULL, y = NULL) +
xlab(paste0(goi, " nCount_RNA")) + ylab("# of cells") +
theme_bw() +
geom_vline(xintercept = threshold, col = "blue") +
annotate(
"rect",
xmin = -Inf,
xmax = threshold,
ymin = 0,
ymax = Inf,
alpha = 0.2,
fill = "deepskyblue"
) +
annotate(
"rect",
xmin = threshold,
xmax = Inf,
ymin = 0,
ymax = Inf,
alpha = 0.2,
fill = "chocolate4"
)
# Histogram log transformed
hist2 <- ggplot(log2.counts.df, aes(x = counts)) +
geom_histogram(bins = 100, fill = "gray", color = "black") +
labs(title = title, x = NULL, y = NULL) +
xlab(paste0("Log2(", goi, " nCount_RNA)")) + ylab("# of cells") +
theme_bw() +
geom_vline(xintercept = log2.threshold, col = "blue") +
annotate(
"rect",
xmin = -Inf,
xmax = log2.threshold,
ymin = 0,
ymax = Inf,
alpha = 0.2,
fill = "deepskyblue"
) +
annotate(
"rect",
xmin = log2.threshold,
xmax = Inf,
ymin = 0,
ymax = Inf,
alpha = 0.2,
fill = "chocolate4"
)
# plot
plots <- list(hist1, hist2)
layout <- rbind(c(1), c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
# number removed
table(counts.df$counts > threshold)
##
## FALSE TRUE
## 24879 8104
Malat1 all samples
# User params
goi <- "Malat1"
threshold <- 0.1
# Subset data
log2.threshold <- log2(threshold + 0.01)
counts.df <- FetchData(data.filtered, vars = goi)
colnames(counts.df) <- "counts"
log2.counts.df <- log2(counts.df + 0.01)
# Histogram
title <-
paste0("Colonna Brain Cell: ", goi, "\nnCount_RNA > ", threshold)
hist1 <- ggplot(counts.df, aes(x = counts)) +
geom_histogram(bins = 100, fill = "gray", color = "black") +
labs(title = title, x = NULL, y = NULL) +
xlab(paste0(goi, " nCount_RNA")) + ylab("# of Samples") +
theme_bw() +
geom_vline(xintercept = threshold, col = "blue") +
annotate(
"rect",
xmin = -Inf,
xmax = threshold,
ymin = 0,
ymax = Inf,
alpha = 0.2,
fill = "deepskyblue"
) +
annotate(
"rect",
xmin = threshold,
xmax = Inf,
ymin = 0,
ymax = Inf,
alpha = 0.2,
fill = "chocolate4"
)
# Histogram log transformed
hist2 <- ggplot(log2.counts.df, aes(x = counts)) +
geom_histogram(bins = 100, fill = "gray", color = "black") +
labs(title = title, x = NULL, y = NULL) +
xlab(paste0("Log2(", goi, " nCount_RNA)")) + ylab("# of Samples") +
theme_bw() +
geom_vline(xintercept = log2.threshold, col = "blue") +
annotate(
"rect",
xmin = -Inf,
xmax = log2.threshold,
ymin = 0,
ymax = Inf,
alpha = 0.2,
fill = "deepskyblue"
) +
annotate(
"rect",
xmin = log2.threshold,
xmax = Inf,
ymin = 0,
ymax = Inf,
alpha = 0.2,
fill = "chocolate4"
)
# plot
plots <- list(hist1, hist2)
layout <- rbind(c(1), c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
# number removed
table(counts.df$counts > threshold)
##
## TRUE
## 32983
Ttr by sample
## [1] 1
## [1] 1
## [1] 2
## [1] 2
## [1] 3
## [1] 3
## [1] 4
## [1] 4
## [1] 5
## [1] 5
## [1] 6
## [1] 6
## [1] 7
## [1] 7
## [1] 8
## [1] 8
# Visualize the number of cell counts per sample
cell_count_data <- as.data.frame(table(data.filtered$sample))
colnames(cell_count_data) <- c("sample", "frequency")
ncellsFiltered <-
ggplot(cell_count_data, aes(x = sample, y = frequency, fill = sample)) +
geom_col() +
theme_classic() +
geom_text(aes(label = frequency),
position = position_dodge(width = 0.9),
vjust = -0.25) +
scale_y_continuous(breaks = seq(0, 12000, by = 2000),
limits = c(0, 12000)) +
ggtitle("Filtered: cells per sample") +
theme(legend.position = "none") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Arrange graphs in grid
plots <- list(ncellsRaw, ncellsFiltered)
layout <- cbind(c(1), c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
# Visualize nCount_RNA
denCount_filter <- ggplot(data.filtered@meta.data,
aes(color = sample,
x = nCount_RNA,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
xlab("nCount_RNA") +
ylab("Density") +
geom_vline(xintercept = nCount.min) +
geom_vline(xintercept = nCount.max) +
theme(legend.position = "none")
# Visualize nCount_RNA
denFeature_filter <- ggplot(data.filtered@meta.data,
aes(color = sample,
x = nFeature_RNA,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
xlab("nFeature_RNA") +
ylab("Density") +
geom_vline(xintercept = nFeature.min) +
geom_vline(xintercept = nFeature.max) +
theme(legend.position = "none")
# Visualize cell complexity
# Quality cells are usually above 0.85
denCellComplexity_filter <- ggplot(data.filtered@meta.data,
aes(color = sample,
x = cell.complexity,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
xlab("Cell Complexity (log10(nFeature/nCount))") +
ylab("Density") +
theme(legend.position = "none")
# Visualize percent.mt
denMt_filter <- ggplot(data.filtered@meta.data,
aes(color = sample,
x = percent.mt,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_continuous(n.breaks = 4) +
geom_vline(xintercept = mt.cutoff) +
xlab("% Mitochondrial Genes") +
ylab("Density") +
theme(legend.position = "none")
# Arrange graphs in grid
plots <- list(
denCount,
denFeature,
denCellComplexity,
denCount_filter,
denFeature_filter,
denCellComplexity_filter
)
layout <- rbind(c(1, 4), c(2, 5), c(3, 6))
grid_den_raw_filter <-
grid.arrange(grobs = plots, layout_matrix = layout)
# Visualize percent.mt
denMt_filter <- ggplot(data.filtered@meta.data,
aes(color = sample,
x = percent.mt,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_continuous(n.breaks = 4) +
geom_vline(xintercept = mt.cutoff) +
xlab("% Mitochondrial Genes") +
ylab("Density") +
theme(legend.position = "none")
# Visualize percent.ribo
denRb_filter <- ggplot(data.filtered@meta.data,
aes(color = sample,
x = percent.ribo,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_continuous(n.breaks = 4) +
xlab("% Ribosomal Genes") +
ylab("Density") +
theme(legend.position = "none")
# Visualize percent.hb
denHb_filter <- ggplot(data.filtered@meta.data,
aes(color = sample,
x = percent.hb,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_continuous(n.breaks = 4) +
xlab("% Hemoglobin Genes") +
ylab("Density") +
theme(legend.position = "none")
# Arrange graphs in grid
plots <- list(denMt, denRb, denHb, denMt_filter, denRb_filter, denHb_filter)
layout <- rbind(c(1, 4), c(2, 5), c(3, 6))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
save
Density plots for genes of interest
# Visualize the number of counts per cell
denApoe_filter <- ggplot(data.filtered@meta.data,
aes(color = sample,
x = log2(percent.Apoe),
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_continuous(n.breaks = 4) +
xlab("percent Apoe") +
ylab("Density") +
theme(legend.position = "none")
denApoe_filter
## Warning: Removed 14922 rows containing non-finite values (stat_density).
denTtr_filter <- ggplot(data.filtered@meta.data,
aes(color = sample,
x = log2(percent.Ttr),
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
xlab("percent Ttr") +
ylab("Density") +
theme(legend.position = "none")
denTtr_filter
## Warning: Removed 23827 rows containing non-finite values (stat_density).
# Arrange graphs in grid
plots <- list(denApoe, denTtr, denApoe_filter, denTtr_filter)
layout <- rbind(c(1,3), c(2,4))
grid_den_raw_filter_genes <- grid.arrange(grobs = plots, layout_matrix = layout)
## Warning: Removed 17557 rows containing non-finite values (stat_density).
## Warning: Removed 30402 rows containing non-finite values (stat_density).
## Warning: Removed 14922 rows containing non-finite values (stat_density).
## Warning: Removed 23827 rows containing non-finite values (stat_density).
# nFeature, nCount, and cell.complexity violins
vCellInfo <- VlnPlot(data.filtered,
features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
ncol = 3,
group.by = 'sample',
pt.size = 0)
vCellInfo
# percent Mt, ribo, hemo
vMtRiboHb <- VlnPlot(data.filtered,
features = c("percent.mt","percent.ribo","percent.hb"),
ncol = 3,
group.by = 'sample',
pt.size = 0.001)
vMtRiboHb
# percent Ttr
vTtr <- VlnPlot(data.filtered,
features = c("percent.Ttr"),
group.by = 'sample',
pt.size = 0)
vTtr
# percent Apoe
vApoe<- VlnPlot(data.filtered,
features = c("percent.Apoe"),
group.by = 'sample',
pt.size = 0)
vApoe
sMt <- ggplot(
data.filtered@meta.data,
aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) +
geom_point() +
stat_smooth(method=lm) +
scale_x_log10() +
scale_y_log10() +
theme_classic() +
geom_vline(xintercept = nCount.min) +
geom_vline(xintercept = nCount.max) +
geom_hline(yintercept = nFeature.min) +
geom_hline(yintercept = nFeature.max) +
facet_wrap(~sample, ncol = 4) +
scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
sMt
## `geom_smooth()` using formula 'y ~ x'
sMt <- FeatureScatter(data.filtered,
feature1 = "nCount_RNA",
feature2 = "percent.mt",
group.by = 'sample',
shuffle = TRUE)
sMt
# Visualize the distribution of genes detected per cell via boxplot
b1 <- ggplot(data.filtered@meta.data,
aes(x = sample,
y = log10(nFeature_RNA),
fill=sample)) +
geom_boxplot() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust = 0.5, face="bold")) +
ggtitle("Unique Genes / Cell / Sample") +
xlab("Sample") +
theme(legend.position = "none")
b1
df <- data.frame(row.names = rownames(data.filtered))
df$rsum <- rowSums(x = data.filtered, slot = "counts")
df$gene_name <- rownames(df)
df <- df[order(df$rsum,decreasing = TRUE),]
head(df, 20)
## rsum gene_name
## Malat1 16890260 Malat1
## Meg3 1377819 Meg3
## Snhg11 595771 Snhg11
## Rtn1 180953 Rtn1
## Ube3a 170010 Ube3a
## Gria2 128412 Gria2
## Apoe 107107 Apoe
## Acp1 96951 Acp1
## Rsrp1 95958 Rsrp1
## Cst3 84541 Cst3
## PISD 83844 PISD
## Atp1b1 78768 Atp1b1
## mt-Atp6 76932 mt-Atp6
## mt-Co3 76634 mt-Co3
## Grin2b 76361 Grin2b
## Ntm 72273 Ntm
## Itm2b 68363 Itm2b
## Pcsk2 65350 Pcsk2
## Tecr 62821 Tecr
## Ptgds 61343 Ptgds
For something to be informative, it needs to exhibit variation, but not all variation is informative. The goal of our clustering analysis is to keep the major sources of variation in our dataset that should define our cell types, while restricting the variation due to uninteresting sources of variation (sequencing depth, cell cycle differences, mitochondrial expression, batch effects, etc.). Then, to determine the cell types present, we will perform a clustering analysis using the most variable genes to define the major sources of variation in the dataset.
The most common biological data correction is to remove the effects of the cell cycle on the transcriptome. This data correction can be performed by a simple linear regression against a cell cycle score.
Check cell cycle phase BEFORE doing sctransform. Counts to need to be comparable between cells and each cell has a different number for nCount_RNA.
# summary of counts per cell
summary(data.filtered@meta.data$nCount_RNA)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 310 816 1115 1266 1534 8303
Use the NormalizeData() function with the argument LogNormalze to account for sequencing depth. nCount_RNA for each gene is divided by the total nCount_RNA for that cell. This is done for all cells. This number is then multiplied by the scale.factor so we don’t have to deal with a tiny number. This number is then natural-log transformed using log1p. log1p is the natural logarithm (base e) of 1 + count. The 1 will prevent taking the log of 0.
data.phase <- NormalizeData(data.filtered,
scale.factor = 10000, # default
normalization.method = "LogNormalize" # default
)
Give each cell a score based on expression of G1, G2/M, and S phase markers. A list of markers is provided for humans. We will need to convert human gene names to mouse gene names. We will use the CellCycleScoring() function in seurat.
Below is a resource for acquiring cell markers for other organisms https://hbctraining.github.io/scRNA-seq_online/lessons/cell_cycle_scoring.html
G1 ~10 hrs S ~5-6 hrs G2 ~3-4 hrs M ~2 hrs
G1 (10 hrs) > G2/M (5-6 hrs) = S (5-6 hrs)
If the score is negative for both S.Score and G2M.Score the phase is G1. Otherwise the the greatest positive value between S.Score and G2M.Score determines the phase.
Using a human list for mouse https://github.com/satijalab/seurat/issues/2493
# Basic function to convert human to mouse gene names
convertHumanGeneList <- function(x){
require("biomaRt")
human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
genesV2 = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol",
values = x , mart = human, attributesL = c("mgi_symbol"),
martL = mouse, uniqueRows=T)
humanx <- unique(genesV2[, 2])
# Print the first 6 genes found to the screen
print(head(humanx))
return(humanx)
}
m.s.genes <- convertHumanGeneList(cc.genes.updated.2019$s.genes)
m.g2m.genes <- convertHumanGeneList(cc.genes.updated.2019$g2m.genes)
saveRDS(m.s.genes, "../../results/geneLists/m.s.genes.rds")
saveRDS(m.g2m.genes, "../../results/geneLists/m.g2m.genes.rds")
m.s.genes <- readRDS("../../results/geneLists/m.s.genes.rds")
m.g2m.genes <- readRDS("../../results/geneLists/m.g2m.genes.rds")
# score cells for cell cycle
data.phase <- CellCycleScoring(data.phase,
g2m.features = m.g2m.genes,
s.features = m.s.genes,
set.ident = TRUE)
## Warning: The following features are not present in the object: Mcm4, Rrm2,
## Uhrf1, Chaf1b, Exo1, Gmnn, Cdc45, Fen1, Cdc6, Cenpu, Dscc1, Hells, Rad51ap1,
## Tipin, Prim1, Clspn, Cdca7, Mcm6, Mcm5, Pola1, Usp1, Ung, Blm, Casp8ap2, Pcna,
## Polr1b, Rrm1, Mcm7, Rad51, Ccne2, E2f8, not searching for symbol synonyms
## Warning: The following features are not present in the object: Gtse1, Smc4,
## Dlgap5, Cdc25c, Kif20b, Cdca2, Cks2, Ttk, Cdk1, Cdca3, Ckap2, Hmgb2, Gas2l3,
## Tacc3, Cdc20, Birc5, Lbr, Anln, Cks1b, Kif2c, Hmmr, Cenpa, Kif23, Aurkb, Kif11,
## Tpx2, Ccnb2, Cdca8, Ndc80, Cenpe, Nek2, Ect2, Jpt1, Bub1, Pimreg, Nusap1,
## Ckap2l, Aurka, Ube2c, Nuf2, not searching for symbol synonyms
cellcyclecount_barplot <-
as_tibble(data.phase[[]]) %>%
ggplot(aes(Phase, fill = Phase)) + geom_bar()
cellcyclecount_barplot
percent.phase <- data.phase@meta.data %>%
group_by(sample, Phase) %>%
dplyr::count() %>%
group_by(sample) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x = sample, y = percent, fill = Phase)) +
geom_col() +
ggtitle("Percentage of phase per sample") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
percent.phase
# pie point
cellcyclecount_piepoint <-
as_tibble(data.phase[[]]) %>%
ggplot(aes(x=S.Score, y=G2M.Score, color=Phase)) +
geom_point()
cellcyclecount_piepoint
Find the top variable genes before performing PCA. The data is scaled since highly expressed genes usually are the most variable. This will make the mean expression zero and the variance of each gene across cells is one.
# Identify the most variable genes
data.phase <- FindVariableFeatures(data.phase,
selection.method = "vst", # default vst
nfeatures = 2000, # default 2000
verbose = FALSE)
# view top variable genes
top40 <- head(VariableFeatures(data.phase), 40)
top40
## [1] "Ttr" "Ptgds" "Vip" "Npy" "Sst" "Apoe" "Vtn" "Apod"
## [9] "Myoc" "Cldn5" "Cd74" "Mgp" "Acta2" "Ly6c1" "Spp1" "C1qa"
## [17] "Tac2" "Plp1" "Igfbp7" "Avp" "Cst3" "Igfbp2" "Trf" "Klk6"
## [25] "Ly6a" "Clu" "Atp1a2" "Hexb" "Nts" "Rgs5" "Crhbp" "Crip1"
## [33] "Myl9" "Trh" "Ctss" "Cort" "Flt1" "Trem2" "C1qc" "Tac1"
# plot variable features with labels
VarFeatPlot <- VariableFeaturePlot(data.phase, cols = c("gray47","red"))
VarFeatPlotLabel <- LabelPoints(plot = VarFeatPlot,
points = top40, repel = TRUE, fontface="italic",
xnudge = 0, ynudge = 0, max.overlaps = 12)
VarFeatPlotLabel
# The variability information can be accessed using the HVFInfo method.
# The names of the variable features can be accessed with VariableFeatures().
variance.data <- as_tibble(HVFInfo(data.phase),rownames = "Gene")
variance.data <- variance.data %>% mutate(hypervariable=Gene %in% VariableFeatures(data.phase))
# We can plot out a graph of the variance vs mean and highlight the selected genes
# this way, we can see whether we think we’re likely to capture what we need.
subset_data <- subset(variance.data, hypervariable == TRUE)
varGeneslog <- variance.data %>%
ggplot(aes(log(mean),log(variance),color=hypervariable)) +
geom_point() +
scale_color_manual(values=c("black","red")) + geom_text_repel(
data = subset_data, max.overlaps = 20,
aes(
x = log(mean),
y = log(variance),
label = Gene,
fontface="italic",),segment.alpha = 1,size = 4) +
theme(legend.position="bottom")
varGeneslog
remove(subset_data)
See if the cell cycle is a major source of variation using PCA. We scale the data because highly expressed genes exhibit the highest amount of variation and we don’t want our ‘highly variable genes’ only to reflect high expression.
vst: First, fits a line to the relationship of log(variance) and log(mean) using local polynomial regression (loess). Then, feature values are standardized using the observed mean and expected variance (given by the fitted line). Feature variance is then calculated on the standardized values after clipping to a maximum (see clip.max parameter).
The ScaleData() function in Seurat will adjust gene expressions so that the mean expression in each cell is 0. It will also scale each gene to give a variance of 1 for each cell.
# Scale the counts
data.phase <- ScaleData(data.phase)
## Centering and scaling data matrix
data.phase@assays
## $RNA
## Assay data with 8003 features for 32983 cells
## Top 10 variable features:
## Ttr, Ptgds, Vip, Npy, Sst, Apoe, Vtn, Apod, Myoc, Cldn5
data.phase.pca <- RunPCA(data.phase, nfeatures.print = 10)
## PC_ 1
## Positive: Apoe, Atp1a2, Plpp3, Gpr37l1, Slc1a3, Cd63, Gja1, Mfge8, F3, Htra1
## Negative: Meg3, Snhg11, Opcml, Gm12027, R3hdm1, Fam19a1, Atp2b1, 2900055J20Rik, Epha5, Chgb
## PC_ 2
## Positive: Ctss, C1qa, C1qc, C1qb, Laptm5, Ctsh, Trem2, Fcrls, Hexb, Selplg
## Negative: Gpr37l1, Plpp3, Mfge8, Cldn10, Pla2g7, Gja1, F3, Slc7a10, Ntsr2, Slc1a3
## PC_ 3
## Positive: C1ql2, Zbtb20, C1ql3, Fam19a1, Ttr, Icam5, Galnt1, Atp2b1, Gm12027, Nell2
## Negative: Snhg11, Slc32a1, Meg3, Cit, Slc6a1, Resp18, Npy, Kcnc1, Nxph1, Sst
## PC_ 4
## Positive: Slc32a1, Synpr, C1ql2, Scg2, Ttr, Slc6a1, Spock3, Nxph1, Gria1, Pnoc
## Negative: R3hdm1, Ptn, Ptprd, Mef2c, Stx1a, Vtn, Ly6e, Cck, Ntm, Lmo4
## PC_ 5
## Positive: Vtn, Slc6a20a, Slc6a13, Pltp, Aebp1, Itm2a, Pcolce, Slc22a8, Cxcl12, Dcn
## Negative: Ctss, C1qa, C1qc, Selplg, C1qb, Laptm5, Fcrls, Hexb, Csf1r, Trem2
If the plots for each phase look very similar to each other, do not regress out variation due to cell cycle. You can plot PC1 vs PC2 before and after regression to see how effective it was. G1 (10 hrs) > G2/M (5-6 hrs) = S (5-6 hrs)
# Plot the PCA colored by cell cycle phase
cycle.pca.phase <- DimPlot(data.phase.pca,
reduction = "pca",
group.by= "Phase",
split.by = "Phase")
cycle.pca.phase
cycle.pca <- DimPlot(data.phase.pca,
reduction = "pca",
group.by = "Phase",
shuffle = TRUE)
cycle.pca
clean up seurat objects
remove(data_sample, data.filtered, data.filtered.split, data.phase.pca, cycle.pca.phase)
Now, we can use the SCTransform method as a more accurate method of normalizing, estimating the variance of the raw filtered data, and identifying the most variable genes. Variation in sequencing depth (total nCount_RNA per cell) is normalized using a regularized negative binomial model. ??Variance is also adjusted based on pooling information across genes with similar abundances??
Sctransform automatically accounts for cellular sequencing depth by regressing out sequencing depth (nUMIs). However, if there are other sources of uninteresting variation identified in the data during the exploration steps we can also include these. We observed little to no effect due to cell cycle phase and so we chose not to regress this out of our data. We observed some effect of mitochondrial expression and so we choose to regress this out from the data.
Since we have several samples in our dataset, we want to keep them as separate objects and transform them as that is what is required for integration. We will first split the cells in seurat.phase object by sample.
data.split <- SplitObject(data.phase, split.by = "sample")
# check
data.split
## $WT_Cor
## An object of class Seurat
## 8003 features across 9694 samples within 1 assay
## Active assay: RNA (8003 features, 2000 variable features)
##
## $Trem2_KO_Cor
## An object of class Seurat
## 8003 features across 6158 samples within 1 assay
## Active assay: RNA (8003 features, 2000 variable features)
##
## $WT_5XFAD_Cor
## An object of class Seurat
## 8003 features across 3985 samples within 1 assay
## Active assay: RNA (8003 features, 2000 variable features)
##
## $Trem2_KO_5XFAD_Cor
## An object of class Seurat
## 8003 features across 2919 samples within 1 assay
## Active assay: RNA (8003 features, 2000 variable features)
##
## $WT_Hip
## An object of class Seurat
## 8003 features across 2619 samples within 1 assay
## Active assay: RNA (8003 features, 2000 variable features)
##
## $Trem2_KO_Hip
## An object of class Seurat
## 8003 features across 2241 samples within 1 assay
## Active assay: RNA (8003 features, 2000 variable features)
##
## $WT_5XFAD_Hip
## An object of class Seurat
## 8003 features across 3339 samples within 1 assay
## Active assay: RNA (8003 features, 2000 variable features)
##
## $Trem2_KO_5XFAD_Hip
## An object of class Seurat
## 8003 features across 2028 samples within 1 assay
## Active assay: RNA (8003 features, 2000 variable features)
Now we will use a ‘for loop’ to run the SCTransform() on each sample, and regress out mitochondrial expression by specifying in the vars.to.regress argument of the SCTransform() function.
Before we run this for loop, we know that the output can generate large R objects/variables in terms of memory. If we have a large dataset, then we might need to adjust the limit for allowable object sizes within R (Default is 500 * 1024 ^ 2 = 500 Mb) using the following code:
options(future.globals.maxSize = 4000 * 1024^5)
for (i in 1:length(data.split)) {
print(paste0("Sample ", i))
data.split[[i]] <- SCTransform(data.split[[i]],
verbose = FALSE
)
}
## [1] "Sample 1"
## [1] "Sample 2"
## [1] "Sample 3"
## [1] "Sample 4"
## [1] "Sample 5"
## [1] "Sample 6"
## [1] "Sample 7"
## [1] "Sample 8"
NOTE: By default, after normalizing, adjusting the variance, and regressing out uninteresting sources of variation, SCTransform will rank the genes by residual variance and output the 3000 most variant genes. If the dataset has larger cell numbers, then it may be beneficial to adjust this parameter higher using the variable.features.n argument.
Note, the last line of output specifies “Set default assay to SCT”. We can view the different assays that we have stored in our seurat object.
A thread about whether or not regress out batch: https://github.com/satijalab/seurat/issues/3270 It is suggested to not regress out batch, and instead use a data integration method
# Check
data.split
## $WT_Cor
## An object of class Seurat
## 16003 features across 9694 samples within 2 assays
## Active assay: SCT (8000 features, 3000 variable features)
## 1 other assay present: RNA
##
## $Trem2_KO_Cor
## An object of class Seurat
## 15998 features across 6158 samples within 2 assays
## Active assay: SCT (7995 features, 3000 variable features)
## 1 other assay present: RNA
##
## $WT_5XFAD_Cor
## An object of class Seurat
## 15996 features across 3985 samples within 2 assays
## Active assay: SCT (7993 features, 3000 variable features)
## 1 other assay present: RNA
##
## $Trem2_KO_5XFAD_Cor
## An object of class Seurat
## 15965 features across 2919 samples within 2 assays
## Active assay: SCT (7962 features, 3000 variable features)
## 1 other assay present: RNA
##
## $WT_Hip
## An object of class Seurat
## 15952 features across 2619 samples within 2 assays
## Active assay: SCT (7949 features, 3000 variable features)
## 1 other assay present: RNA
##
## $Trem2_KO_Hip
## An object of class Seurat
## 15933 features across 2241 samples within 2 assays
## Active assay: SCT (7930 features, 3000 variable features)
## 1 other assay present: RNA
##
## $WT_5XFAD_Hip
## An object of class Seurat
## 15981 features across 3339 samples within 2 assays
## Active assay: SCT (7978 features, 3000 variable features)
## 1 other assay present: RNA
##
## $Trem2_KO_5XFAD_Hip
## An object of class Seurat
## 15914 features across 2028 samples within 2 assays
## Active assay: SCT (7911 features, 3000 variable features)
## 1 other assay present: RNA
Condition-specific clustering of cells indicates that we need to integrate the cells across conditions to ensure that cells of the same cell type cluster together.
To integrate, use the shared highly variable genes from each condition identified using SCTransform. Then, integrate conditions to overlay cells that are similar or have a “common set of biological features” between groups.
Now, using our SCTransform object as input, let’s perform the integration across conditions.
First, we need to specify that we want to use all of the 3000 most variable genes identified by SCTransform for the integration. By default, this function selects the top 2000 genes.
# Choose the features to use when integrating multiple datasets.
# will use nfeatures as 3000 as defined by running SCTransform above
var.features <- SelectIntegrationFeatures(object.list = data.split,
nfeatures = 3000)
# merge the data
data.sct.merged <- merge(x = data.split[[1]],
y = c(
data.split[[2]],
data.split[[3]],
data.split[[4]],
data.split[[5]],
data.split[[6]],
data.split[[7]],
data.split[[8]]
),
project = paste0("Colonna data ", tissue, " Single Nucleus")
)
# define the variable features
VariableFeatures(data.sct.merged) <- var.features
save rObject
insepct data
data.sct.merged <- RunPCA(data.sct.merged, assay = "SCT")
## PC_ 1
## Positive: Ube3a, Snhg11, Fam19a1, Pcsk2, Grin2b, Acp1, Gria2, Rtn1, R3hdm1, Cul4a
## PISD, Opcml, Gm12027, Syt1, Matk, Cck, Arpp21, Atp2b1, Celf2, Grin2a
## 2900055J20Rik, Mef2c, Epha5, Kalrn, Miat, Thy1, Rsrp1, Chgb, Ptprn, Csmd1
## Negative: Apoe, Cst3, Atp1a2, Clu, Plpp3, Cd81, Gpr37l1, Cd63, Ptgds, Slc1a3
## Mfge8, Cd9, Gja1, Apod, Slc1a2, Plp1, Htra1, Pla2g7, F3, Sparcl1
## Ntsr2, Cldn10, Bcan, Ptn, mt-Atp6, mt-Co3, Serpine2, S1pr1, mt-Co1, Ptprz1
## PC_ 2
## Positive: Apod, Plp1, Ptgds, Trf, Cldn11, C1qa, Mag, Hexb, Sst, C1qc
## Ctss, C1qb, Cd9, Mal, mt-Atp6, Npy, mt-Co1, mt-Co3, Vtn, Ctsd
## mt-Cytb, mt-Nd1, Resp18, Mog, Slc32a1, Bsg, Fth1, mt-Co2, Lgmn, Itm2a
## Negative: Plpp3, Clu, Gpr37l1, Mfge8, Apoe, Slc1a3, Gja1, Slc1a2, Cldn10, Ntsr2
## Pla2g7, F3, Htra1, Bcan, Atp1a2, Slc7a10, Btbd17, S1pr1, Fam19a1, Cst3
## Mlc1, Ntm, Aqp4, Gm3764, Sparcl1, Nkain4, Lcat, Slc6a11, Ptprz1, Phkg1
## PC_ 3
## Positive: Apod, Plp1, Ptgds, Fam19a1, Trf, Cldn11, Mag, Mal, Cd9, C1qa
## Pcsk2, Cul4a, Mog, Vtn, C1qc, Neat1, C1qb, Ctss, Baiap2, R3hdm1
## Enpp2, Hexb, Celf2, C1ql3, Acp1, Arpp21, Gsn, Gm12027, Kalrn, Apoe
## Negative: Sst, Npy, Resp18, Slc32a1, Atp1b1, Slc6a1, Synpr, Crhbp, Scg2, Snhg11
## Vip, Clu, Scg5, Atp6ap2, Nxph1, Pcsk1n, Kcnc1, Ly6h, Pnoc, Rbp4
## Serpini1, Stmn3, Rtn1, Dner, Sparcl1, Cacna2d2, Cxcl14, Podxl2, Psap, Ache
## PC_ 4
## Positive: Plp1, Trf, Cd9, Mag, Cldn11, Hexb, C1qa, C1qc, Cst3, Ctss
## Cd81, C1qb, Mal, Mog, Selplg, Ctsd, Pllp, Aplp1, Sox10, Lgmn
## Scd2, Laptm5, Stmn4, Fcrls, Apoe, Gpr37, Opalin, Csf1r, Tnfaip6, P2ry12
## Negative: Vtn, Atp1a2, Itm2a, Igfbp7, Pltp, Igfbp2, mt-Atp6, Ifitm3, mt-Co1, mt-Co3
## Cxcl12, mt-Cytb, Slc6a20a, Aebp1, Id3, Slc6a13, Slc7a11, Mgp, Dcn, Slc22a8
## Pcolce, Slco1a4, Vim, Tm4sf1, Itih5, Hspb1, Serping1, Klf2, Id1, mt-Nd4
## PC_ 5
## Positive: Plp1, Apod, Ptgds, Cldn11, Trf, Mag, Mal, Mog, Neat1, Apoe
## Pllp, Gsn, Sox10, Scd2, Enpp2, Ptn, Tnfaip6, Atp1a2, Opalin, Gpr37
## C4b, Hapln2, Stmn4, Igfbp2, Vtn, Sst, Aplp1, Aebp1, Pdgfra, Gjc2
## Negative: C1qa, Hexb, Cst3, Ctss, C1qc, C1qb, Selplg, Ctsd, Laptm5, Fcrls
## Csf1r, Fcer1g, P2ry12, Cx3cr1, Tyrobp, Tmem119, Cyba, Gpr34, Lgmn, Olfml3
## Vsir, Hmha1, Unc93b1, Cd68, P2ry13, Siglech, Ctsz, Sparc, Lag3, B2m
pcabrain_region <- DimPlot(data.sct.merged,
reduction = "pca",
group.by = "brain_region")
pcabrain_region
pcabrain_region_split <- DimPlot(data.sct.merged,
reduction = "pca",
split.by = "brain_region",
group.by = "brain_region")
pcabrain_region_split
pcagenotype <- DimPlot(data.sct.merged,
reduction = "pca",
group.by = "genotype")
pcagenotype
pcagenotype_split <- DimPlot(data.sct.merged,
reduction = "pca",
split.by = "genotype",
group.by = "genotype")
pcagenotype_split
pcasample <- DimPlot(data.sct.merged,
reduction = "pca",
split.by = "sample",
group.by = "sample")
pcasample
pcasample_split <- DimPlot(data.sct.merged,
reduction = "pca",
group.by = "sample",
shuffle = TRUE)
pcasample_split
# clean up
remove(pcabrain_region, pcabrain_region_split, pcagenotype, pcagenotype_split, pcasample, pcasample_split)
run harmony
# run harmony to harmonize over samples
data.integrated <- RunHarmony(object = data.sct.merged,
group.by.vars = "sample",
assay.use = "SCT",
reduction = "pca",
plot_convergence = TRUE)
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony 4/10
## Harmony 5/10
## Harmony 6/10
## Harmony 7/10
## Harmony 8/10
## Harmony converged after 8 iterations
## Warning: Invalid name supplied, making object name syntactically valid. New
## object name is Seurat..ProjectDim.SCT.harmony; see ?make.names for more details
## on syntax validity
# first put the data back in place
DefaultAssay(data.integrated) <- "SCT"
Idents(data.integrated) <- "sample"
data.integrated$sample <- factor(data.integrated$sample,
levels = sample_order)
# check PCA
p1 <- DimPlot(object = data.integrated,
reduction = "harmony",
group.by = "sample",
shuffle = TRUE)
p1
p2 <- VlnPlot(object = data.integrated,
features = "harmony_1",
group.by = "brain_region",
pt.size = 0)
p3 <- VlnPlot(object = data.integrated,
features = "harmony_1",
group.by = "brain_region",
pt.size = 0) + NoLegend()
p3
# clean up
remove(p1, p2, p3)
Top 20 variable features
top20 <- data.integrated@assays$SCT@var.features[1:20]
top20
## [1] "Apoe" "Sst" "Vip" "Ptgds" "Npy" "Apod" "Plp1" "Vtn"
## [9] "Cst3" "Atp1a2" "Clu" "Tac2" "Cd9" "Ttr" "Hexb" "C1qa"
## [17] "Plpp3" "Tac1" "Trf" "Igfbp2"
After integration, to visualize the integrated data we can use dimensionality reduction techniques, such as PCA and Uniform Manifold Approximation and Projection (UMAP). While PCA will determine all PCs, we can only plot two at a time. In contrast, UMAP will take the information from any number of top PCs to arrange the cells in this multidimensional space. It will take those distances in multidimensional space, and try to plot them in two dimensions. In this way, the distances between cells represent similarity in expression.
To generate these visualizations with the harmony output, use reduction = “harmony”
# Plot PCA
pcabrain_region <- DimPlot(data.integrated,
reduction = "harmony",
group.by = "brain_region")
pcabrain_region
pcabrain_regionSplit <- DimPlot(data.integrated,
reduction = "harmony",
split.by = "brain_region",
group.by = "brain_region")
pcabrain_regionSplit
pcaGenotype <- DimPlot(data.integrated,
reduction = "harmony",
group.by = "genotype",
shuffle = TRUE)
pcaGenotype
pcaGenotypeSplit <- DimPlot(data.integrated,
reduction = "harmony",
split.by = "genotype",
group.by = "genotype",
shuffle = TRUE)
pcaGenotypeSplit
pcaSample <- DimPlot(data.integrated,
reduction = "harmony",
group.by = "sample",
shuffle = TRUE)
pcaSample
pcaSampleSplit <- DimPlot(data.integrated,
reduction = "harmony",
split.by = "sample",
group.by = "sample")
pcaSampleSplit
## png
## 2
## null device
## 1
## pdf
## 2
## null device
## 1
## pdf
## 2
## null device
## 1
To overcome the extensive technical noise in the expression of any single gene for scRNA-seq data, Seurat assigns cells to clusters based on their PCA scores derived from the expression of the integrated most variable genes, with each PC essentially representing a “metagene” that combines information across a correlated gene set. Determining how many PCs to include in the clustering step is therefore important to ensure that we are capturing the majority of the variation, or cell types, present in our dataset.
# Printing out the most variable genes driving PCs
print(x = data.integrated[["pca"]],
dims = 1:10,
nfeatures = 10)
## PC_ 1
## Positive: Ube3a, Snhg11, Fam19a1, Pcsk2, Grin2b, Acp1, Gria2, Rtn1, R3hdm1, Cul4a
## Negative: Apoe, Cst3, Atp1a2, Clu, Plpp3, Cd81, Gpr37l1, Cd63, Ptgds, Slc1a3
## PC_ 2
## Positive: Apod, Plp1, Ptgds, Trf, Cldn11, C1qa, Mag, Hexb, Sst, C1qc
## Negative: Plpp3, Clu, Gpr37l1, Mfge8, Apoe, Slc1a3, Gja1, Slc1a2, Cldn10, Ntsr2
## PC_ 3
## Positive: Apod, Plp1, Ptgds, Fam19a1, Trf, Cldn11, Mag, Mal, Cd9, C1qa
## Negative: Sst, Npy, Resp18, Slc32a1, Atp1b1, Slc6a1, Synpr, Crhbp, Scg2, Snhg11
## PC_ 4
## Positive: Plp1, Trf, Cd9, Mag, Cldn11, Hexb, C1qa, C1qc, Cst3, Ctss
## Negative: Vtn, Atp1a2, Itm2a, Igfbp7, Pltp, Igfbp2, mt-Atp6, Ifitm3, mt-Co1, mt-Co3
## PC_ 5
## Positive: Plp1, Apod, Ptgds, Cldn11, Trf, Mag, Mal, Mog, Neat1, Apoe
## Negative: C1qa, Hexb, Cst3, Ctss, C1qc, C1qb, Selplg, Ctsd, Laptm5, Fcrls
## PC_ 6
## Positive: Sst, Npy, Igfbp2, Atp1a2, Vtn, Apod, Ptgds, Slc7a11, Crhbp, Slc6a13
## Negative: mt-Atp6, mt-Co3, Atp2b1, mt-Co1, C1ql3, Gpm6a, Rtn1, Aplp1, mt-Nd1, Chgb
## PC_ 7
## Positive: Pdgfra, Serpine2, Matn4, Cd9, C1ql1, Ptprz1, Tmem176b, Sox10, Ptn, Gpr17
## Negative: Plp1, Trf, Cst3, Clu, Slc1a2, Cldn11, Mag, Gja1, Sst, Plpp3
## PC_ 8
## Positive: Ptgds, Atp1a2, Apod, Igfbp2, Vtn, Slc7a11, Dcn, Slc6a13, Aebp1, Rtn1
## Negative: Cd9, Sst, Sox10, Plp1, Npy, Matn4, mt-Co1, mt-Co3, mt-Atp6, Ptprz1
## PC_ 9
## Positive: Tac1, Penk, Synpr, Zbtb20, Sst, Scg2, Car12, Gpr88, Mhrt, Npy
## Negative: Brinp3, Snhg11, Tshz2, Ntm, Ptprd, Cck, R3hdm1, Rsrp1, Pde4d, Ube3a
## PC_ 10
## Positive: Sst, Npy, Crhbp, Rbp4, Ntm, Fam19a1, Cort, Chgb, C1ql3, Atp2b1
## Negative: Vip, Cxcl14, Tac2, Cnr1, Penk, Htr3a, Crh, Pnoc, Tac1, Cpne7
Quantitative approach to an elbow plot - The point where the principal components only contribute 5% of standard deviation and the principal components cumulatively contribute 90% of the standard deviation. - The point where the percent change in variation between the consecutive PCs is less than 0.1%.
First metric
# Determine percent of variation associated with each PC
stdv <- data.integrated[["pca"]]@stdev
sum.stdv <- sum(data.integrated[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
# Calculate cumulative percents for each PC
cumulative <- cumsum(percent.stdv)
# Determine which PC exhibits cumulative percent greater than 90% and
# and % variation associated with the PC as less than 5
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co1
## [1] 43
Second metric
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which(
(percent.stdv[1:length(percent.stdv) - 1] -
percent.stdv[2:length(percent.stdv)]) > 0.1),
decreasing = T)[1] + 1
# last point where change of % of variation is more than 0.1%.
co2
## [1] 22
Choose the minimum of these two metrics as the PCs covering the majority of the variation in the data.
# Minimum of the two calculation
min.pc <- min(co1, co2)
min.pc
## [1] 22
Use min.pc we just calculated to generate the clusters. We can plot the elbow plot again and overlay the information determined using our metrics:
# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv,
cumu = cumulative,
rank = 1:length(percent.stdv))
# Elbow plot to visualize
ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) +
geom_text() +
geom_vline(xintercept = 90, color = "grey") +
geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
theme_bw()
# Run UMAP
data.integrated <- RunUMAP(data.integrated,
dims = 1:min.pc,
reduction = "harmony",
n.components = 3) # set to 3 to use with VR
DimPlot(data.integrated,
group.by = "brain_region",
shuffle = TRUE)
DimPlot(data.integrated,
group.by = "brain_region",
split.by = "brain_region",
shuffle = TRUE)
DimPlot(data.integrated,
group.by = "genotype",
shuffle = TRUE)
DimPlot(data.integrated,
group.by = "genotype",
split.by = "genotype",
shuffle = TRUE)
DimPlot(data.integrated,
group.by = "sample",
shuffle = TRUE)
DimPlot(data.integrated,
group.by = "sample",
split.by = "sample",
shuffle = TRUE) +
NoLegend()
FeaturePlot(data.integrated,
features = "Apoe")
FeaturePlot(data.integrated,
features = "Ttr")
Seurat uses a graph-based clustering approach, which embeds cells in a graph structure, using a K-nearest neighbor (KNN) graph (by default), with edges drawn between cells with similar gene expression patterns. Then, it attempts to partition this graph into highly interconnected ‘quasi-cliques’ or ‘communities’ [Seurat - Guided Clustering Tutorial].
We will use the FindClusters() function to perform the graph-based clustering. The resolution is an important argument that sets the “granularity” of the downstream clustering and will need to be optimized for every individual experiment. For datasets of 3,000 - 5,000 cells, the resolution set between 0.4-1.4 generally yields good clustering. Increased resolution values lead to a greater number of clusters, which is often required for larger datasets.
The FindClusters() function allows us to enter a series of resolutions and will calculate the “granularity” of the clustering. This is very helpful for testing which resolution works for moving forward without having to run the function for each resolution.
# Determine the K-nearest neighbor graph
data.unannotated <- FindNeighbors(object = data.integrated,
assay = "SCT", # set as default after SCTransform
reduction = "harmony",
dims = 1:min.pc)
# Determine the clusters for various resolutions
data.unannotated <- FindClusters(object = data.unannotated,
algorithm = 1, # 1= Louvain
resolution = seq(0.1,0.8,by=0.1))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 32983
## Number of edges: 1116677
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9660
## Number of communities: 9
## Elapsed time: 6 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 32983
## Number of edges: 1116677
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9520
## Number of communities: 14
## Elapsed time: 6 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 32983
## Number of edges: 1116677
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9419
## Number of communities: 14
## Elapsed time: 5 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 32983
## Number of edges: 1116677
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9321
## Number of communities: 15
## Elapsed time: 6 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 32983
## Number of edges: 1116677
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9225
## Number of communities: 15
## Elapsed time: 5 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 32983
## Number of edges: 1116677
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9134
## Number of communities: 17
## Elapsed time: 5 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 32983
## Number of edges: 1116677
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9063
## Number of communities: 18
## Elapsed time: 6 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 32983
## Number of edges: 1116677
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8994
## Number of communities: 20
## Elapsed time: 5 seconds
# 0.1
umap0.1 <- DimPlot(data.unannotated,
group.by = "SCT_snn_res.0.1",
label = TRUE)
umap0.1
# 0.2
umap0.2 <- DimPlot(data.unannotated,
group.by = "SCT_snn_res.0.2",
label = TRUE)
umap0.2
# 0.3
umap0.3 <- DimPlot(data.unannotated,
group.by = "SCT_snn_res.0.3",
label = TRUE)
umap0.3
# 0.4
umap0.4 <- DimPlot(data.unannotated,
group.by = "SCT_snn_res.0.4",
label = TRUE)
umap0.4
# brain_region
ubrain_region <- DimPlot(data.unannotated,
label = FALSE,
group.by = "SCT_snn_res.0.1",
split.by = "brain_region") +
NoLegend()
ubrain_region
# brain_region
ugenotype <- DimPlot(data.unannotated,
label = FALSE,
group.by = "SCT_snn_res.0.1",
split.by = "genotype") +
NoLegend()
ugenotype
# sample
usample <- DimPlot(data.unannotated,
label = FALSE,
group.by = "SCT_snn_res.0.1",
split.by = "sample") +
NoLegend()
usample
# phase
uphase <- DimPlot(data.unannotated,
label = FALSE,
group.by = "SCT_snn_res.0.1",
split.by = "Phase") +
NoLegend()
uphase
data.unannotated[["percent.neuron"]] <-
PercentageFeatureSet(data.unannotated, features = c("Snap25", "Syt1", "Gad1", "Gad2"))
data.unannotated[["percent.astrocytes"]] <-
PercentageFeatureSet(data.unannotated, features = c("Clu", "Gfap", "Aqp4"))
data.unannotated[["percent.microglia"]] <-
PercentageFeatureSet(data.unannotated, features = c("Hexb", "C1qa"))
data.unannotated[["percent.DAM"]] <-
PercentageFeatureSet(data.unannotated,
features = c("Apoe", "Cst7", "Lyz2", "Lpl", "Cd9", "Trem2"))
data.unannotated[["percent.homeostatic"]] <-
PercentageFeatureSet(data.unannotated, features = c("P2ry12", "Cx3cr1"))
data.unannotated[["percent.oligodendrocytes"]] <-
PercentageFeatureSet(data.unannotated, features = c("Plp1", "Mbp", "Mag"))
data.unannotated[["percent.OPCs"]] <-
PercentageFeatureSet(data.unannotated, features = c("Olig1", "Pdgfra", "Vcan"))
FeaturePlot(data.unannotated, features = "percent.neuron")
FeaturePlot(data.unannotated, features = "percent.astrocytes")
FeaturePlot(data.unannotated, features = "percent.microglia")
FeaturePlot(data.unannotated, features = "percent.DAM")
FeaturePlot(data.unannotated, features = "percent.homeostatic")
FeaturePlot(data.unannotated, features = "percent.oligodendrocytes")
FeaturePlot(data.unannotated, features = "percent.OPCs")
## Revist QC metrics
# nCount
fnCount <- FeaturePlot(data.unannotated, features = "nCount_RNA", order = TRUE)
fnCount
# nFeature
fnFeature <- FeaturePlot(data.unannotated, features = "nFeature_RNA", order = TRUE)
fnFeature
# percent.mt
fpercent.mt <- FeaturePlot(data.unannotated, features = "percent.mt", order = TRUE)
fpercent.mt
# cell.complexity
fcell.complexity <- FeaturePlot(data.unannotated, features = "cell.complexity", order = TRUE)
fcell.complexity
fS.Score <- FeaturePlot(data.unannotated, features = "S.Score", order = TRUE)
fS.Score
fG2M.Score <- FeaturePlot(data.unannotated, features = "G2M.Score", order = TRUE)
fG2M.Score
data.unannotated@meta.data$seurat_clusters <-
data.unannotated@meta.data$SCT_snn_res.0.3
# brain_region
bbrain_region <- data.unannotated@meta.data %>%
group_by(seurat_clusters, brain_region) %>%
dplyr::count() %>%
group_by(seurat_clusters) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=brain_region)) +
geom_col() +
ggtitle("Percentage of brain_region per cluster")
bbrain_region
# genotype
bgenotype <- data.unannotated@meta.data %>%
group_by(seurat_clusters, genotype) %>%
dplyr::count() %>%
group_by(seurat_clusters) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=genotype)) +
geom_col() +
ggtitle("Percentage of genotype per cluster")
bgenotype
# sample
bsample <- data.unannotated@meta.data %>%
group_by(seurat_clusters, sample) %>%
dplyr::count() %>%
group_by(seurat_clusters) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=sample)) +
geom_col() +
ggtitle("Percentage of sample per cluster")
bsample
genotype_ncells <- FetchData(data.unannotated,
vars = c("ident", "genotype")) %>%
dplyr::count(ident, genotype) %>%
tidyr::spread(ident, n)
write.table(genotype_ncells,
paste0("../../results/", pathToTestType, "nCells/",
tolower(tissue),
"_cells_per_cluster_genotype.txt"),
quote = FALSE, sep = "\t")
sample_ncells <- FetchData(data.unannotated,
vars = c("ident", "sample")) %>%
dplyr::count(ident,sample) %>%
tidyr::spread(ident, n)
write.table(sample_ncells,
paste0("../../results/", pathToTestType, "nCells/",
tolower(tissue),
"_cells_per_cluster_sample.txt"),
quote = FALSE, sep = "\t")