https://github.com/satijalab/seurat/issues/1621
https://github.com/satijalab/seurat/issues/2340
https://github.com/satijalab/seurat/issues/1883
knitr::opts_knit$set(
root.dir = ".")
library(dplyr)
library(dittoSeq)
library(ggrepel)
library(ggtree)
library(parallel)
library(plotly) # 3D plot
library(Seurat) # Idents()
library(SeuratDisk) # SaveH5Seurat()
library(tibble) # rownames_to_column()
library(harmony) # RunHarmony()
#options(mc.cores = detectCores() - 1)
library(gtools)
pigs.merged <- readRDS("../../rObjects/brain_annotated.rds")
Idents(pigs.merged) <- pigs.merged$merged_clusters
DefaultAssay(pigs.merged) <- "RNA"
# natural-log normalize the RNA assay (LogNormalize is the default method)
pigs.merged <- NormalizeData(pigs.merged, verbose = FALSE)
pigs.merged
## An object of class Seurat
## 42056 features across 7277 samples within 2 assays
## Active assay: RNA (21295 features, 0 variable features)
## 1 other assay present: SCT
## 3 dimensional reductions calculated: pca, harmony, umap
# UMAP
u1 <- DimPlot(object = pigs.merged,
reduction = "umap",
label = TRUE,
label.box = TRUE,
label.size = 4,
repel = TRUE)
u1
# subset
neuron <- subset(pigs.merged, merged_clusters == "Neuron")
# UMAP of neuron only
neuron_colors <- c("gold")
u1 <- DimPlot(object = neuron,
reduction = "umap",
label = TRUE,
label.box = TRUE,
label.size = 4,
repel = TRUE,
cols = neuron_colors)
u1
# show neuron cells by treatment
u2 <- DimPlot(object = neuron,
reduction = "umap",
label = TRUE,
label.box = TRUE,
label.size = 4,
repel = TRUE,
group.by = "treatment",
cols = treatment_colors)
u2
# Visualize the number of cell counts per sample
data <- as.data.frame(table(neuron$sample))
colnames(data) <- c("sample","frequency")
ncells <- ggplot(data, aes(x = sample, y = frequency, fill = sample)) +
geom_col() +
theme_classic() +
geom_text(aes(label = frequency),
position=position_dodge(width=0.9),
vjust=-0.25) +
scale_fill_manual(values = sample_colors) +
# scale_y_continuous(breaks = seq(0,30000, by = 5000), limits = c(0,30000)) +
ggtitle("Cells per sample") +
theme(legend.position = "none") +
theme(axis.text.x = element_text(angle = 45, hjust=1))
ncells
## Top transcripts
df <- data.frame(row.names = rownames(neuron))
df$rsum <- rowSums(x = neuron, slot = "counts")
df$gene_name <- rownames(df)
df <- df[order(df$rsum,decreasing = TRUE),]
head(df, 10)
## rsum gene_name
## KK-MALAT1 190976 KK-MALAT1
## NRXN3 133968 NRXN3
## RBFOX1 107400 RBFOX1
## KCNIP4 102889 KCNIP4
## OPCML 93144 OPCML
## CADM2 90168 CADM2
## DLG2 86279 DLG2
## DPP10 85054 DPP10
## PCDH9 82466 PCDH9
## NRG3 80680 NRG3
# Identify the most variable genes
neuron <- FindVariableFeatures(neuron,
selection.method = "vst", # default vst
nfeatures = 2000, # default 2000
verbose = FALSE)
# view top variable genes
top40 <- head(VariableFeatures(neuron), 40)
top40
## [1] "RN18S" "EYA1" "GRIK1"
## [4] "NXPH1" "RELN" "SOX6"
## [7] "ADARB2" "SST" "ERBB4"
## [10] "FOXP2" "NELL1" "ZNF536"
## [13] "ENSSSCG00000044834" "TMEM132C" "PDE1C"
## [16] "IL1RAPL2" "CDH13" "ENSSSCG00000004511"
## [19] "HS3ST4" "PTPRM" "GRIN3A"
## [22] "PTCHD4" "SLC35F4" "ANK1"
## [25] "THSD7B" "LUZP2" "ENSSSCG00000008533"
## [28] "PLD5" "SLC39A8" "SDK1"
## [31] "TRHDE" "ZNF385D" "ZFPM2"
## [34] "HTR2C" "CNR1" "BTBD11"
## [37] "KIAA1217" "SORCS1" "SEMA3E"
## [40] "KCNC2"
# plot variable features with labels
VarFeatPlot <- VariableFeaturePlot(neuron, cols = c("gray47","red"))
VarFeatPlotLabel <- LabelPoints(plot = VarFeatPlot,
points = top40, repel = TRUE, fontface="italic",
xnudge = 0, ynudge = 0, max.overlaps = 12)
VarFeatPlotLabel
## Median absolute deviation
Remove outlier cells whose log-library size is more than 3 median absolute deviations (MADs) above or below the median log-library size.
# MAD example
log.lib <- as.numeric(log10(neuron$nCount_RNA))
med <- median(log.lib)
abs.dev <- abs(log.lib - med)
mad <- median(abs.dev)
mad <- mad * 1.4826 # scale by the consistency constant so the MAD is comparable to the standard deviation under normality
mad
## [1] 0.4298133
# equivalently, use the built-in stats::mad() function
mad <- mad(log.lib)
mad
## [1] 0.4298133
# remove outliers greater than 3 MADs
remove <- abs(log.lib - median(log.lib)) / mad(log.lib) > 3
table(remove)
## remove
## FALSE
## 3708
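No cells exceed the 3-MAD cutoff here, but if any had been flagged, they could be dropped before downstream steps; a minimal sketch using the remove vector computed above:
# keep only cells within 3 MADs of the median log-library size
# (a no-op for this object since every value of remove is FALSE)
neuron <- subset(neuron, cells = colnames(neuron)[!remove])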
# split object by sample
Idents(neuron) <- neuron$sample
neuron.split <- SplitObject(neuron, split.by = "sample")
# SCTransform and regress percent.mt
neuron.split <- lapply(neuron.split, SCTransform, vars.to.regress = "percent.mt")
## Calculating cell attributes from input UMI matrix: log_umi
## Variance stabilizing transformation of count matrix of size 9679 by 124
## Model formula is y ~ log_umi
## Get Negative Binomial regression parameters per gene
## Using 2000 genes, 124 cells
## There are 1 estimated thetas smaller than 1e-07 - will be set to 1e-07
## Found 21 outliers - those will be ignored in fitting/regularization step
## Second step: Get residuals using fitted parameters for 9679 genes
## Computing corrected count matrix for 9679 genes
## Calculating gene attributes
## Wall clock passed: Time difference of 6.260121 secs
## Determine variable features
## Place corrected count matrix in counts slot
## Regressing out percent.mt
## Centering data matrix
## Set default assay to SCT
## Calculating cell attributes from input UMI matrix: log_umi
## Variance stabilizing transformation of count matrix of size 19494 by 3584
## Model formula is y ~ log_umi
## Get Negative Binomial regression parameters per gene
## Using 2000 genes, 3584 cells
## Found 78 outliers - those will be ignored in fitting/regularization step
## Second step: Get residuals using fitted parameters for 19494 genes
## Computing corrected count matrix for 19494 genes
## Calculating gene attributes
## Wall clock passed: Time difference of 1.694648 mins
## Determine variable features
## Place corrected count matrix in counts slot
## Regressing out percent.mt
## Centering data matrix
## Set default assay to SCT
# Choose the features to use when integrating multiple datasets.
# Use nfeatures = 3000, matching the number of variable features SCTransform returned above.
var.features <- SelectIntegrationFeatures(object.list = neuron.split,
nfeatures = 3000)
# Merge the split object
neuron.sct.merged <- merge(x = neuron.split[[1]],
y = neuron.split[[2]],
project = "LPS Pigs Neurons")
# Define the variable features
VariableFeatures(neuron.sct.merged) <- var.features
# Run PCA on the merged object
neuron.sct.merged <- RunPCA(object = neuron.sct.merged, assay = "SCT")
## PC_ 1
## Positive: TAFA1, KCNIP4, ARPP21, NRG1, LDB2, PDE1A, ST6GALNAC5, PTPRK, SLIT3, ENSSSCG00000011729
## GRM7, CUX2, PCSK2, DSCAM, GPC6, SV2B, RYR2, HS3ST4, ENSSSCG00000049709, DLG1
## ENSSSCG00000042224, RALYL, ENSSSCG00000014097, HS6ST3, KCNB2, KCNQ5, ENSSSCG00000035525, NELL2, CRACDL, GALNT17
## Negative: ERBB4, NXPH1, PTPRM, SOX6, ZNF536, KIAA1217, GALNTL6, KCNC2, ZNF385D, PTCHD4
## TMEM132C, BTBD11, INPP4B, ZNF804A, GAD2, SLC6A1, ZNF804B, ENSSSCG00000044043, SDK1, GRIP1
## ANK1, NHS, DNER, UBASH3B, NRXN3, ENSSSCG00000008533, GAD1, ENSSSCG00000004511, KCNIP1, LSAMP
## PC_ 2
## Positive: HS3ST4, ZFPM2, DPP10, FOXP2, GRIK3, IL1RAPL2, KIAA1217, SEMA3E, LRP1B, CLSTN2
## SGCZ, SLC35F4, SORCS1, GALNT9, GRM3, TSPAN18, MCTP1, GRM8, RASGRP1, LSAMP
## PRKCB, KHDRBS3, TOX, FMNL2, XYLT1, RORA, TLE4, DCC, SLIT1, BCAR3
## Negative: CA10, RFX3, NPAS3, TAFA1, CALM1, GRIA4, ENC1, RASGRF2, SLIT2, HPCAL4
## FSTL5, ENSSSCG00000014097, HS6ST3, CCK, EPHA6, CNTN3, KCND2, UNC5D, ERBB4, CSMD3
## KCTD16, CUX2, CACNB2, STPG4, GALNT13, PDLIM5, STXBP6, SLIT3, PTPRK, GABRA2
## PC_ 3
## Positive: SLC1A2, ZBTB20, PREX2, NOL11, SLC1A3, NHSL1, LRMDA, GLIS3, ENSSSCG00000044913, EYA2
## STAT1, ACSS3, TCF7L2, WWTR1, ENSSSCG00000017146, IRAG1, PARD3, PLEKHA7, CLIC4, GLI3
## LRIG1, FBXL7, ITGAV, CDC14A, POLR3B, EPS8, EGFR, FGFR2, BNC2, SVIL
## Negative: NXPH1, KCNC2, SOX6, TMEM132C, PTCHD4, KIAA1217, RBFOX1, IL1RAPL1, NRXN3, SPOCK3
## GABRG3, GRIA4, HS6ST3, ENSSSCG00000004511, ANK1, CDH9, ROBO1, TRHDE, SGCZ, GRIP1
## DLGAP1, INPP4B, CNTNAP5, ZNF385D, KCNQ3, SLIT2, ENSSSCG00000011729, CNTN4, FGF12, SLC24A2
## PC_ 4
## Positive: ERBB4, ADARB2, ZNF536, ENSSSCG00000050221, HTR2C, SGCZ, SORCS1, SORCS3, KIT, RSPO2
## ZMAT4, KLHL13, SDK1, CNTN5, ENSSSCG00000051441, GPR158, SLC35F4, ASIC2, MACROD2, GALNTL6
## KCNG2, LUZP2, AMOTL1, PTPRO, NWD2, KAZN, CDH20, BTBD11, ZNF804A, NR3C2
## Negative: NXPH1, SOX6, PDE1C, KIAA1217, CACNA2D3, TRHDE, GRIK1, GRIN3A, GRIK3, CDH13
## TMCC3, UTRN, PARD3, SPOCK3, CDH9, BRINP3, ROBO2, SATB1, ROR1, EYA1
## NPAS3, SYNPR, KCNC2, THSD7B, ENSSSCG00000029771, RELN, RBMS1, ENSSSCG00000044043, ENSSSCG00000048562, COL19A1
## PC_ 5
## Positive: GALNTL6, GRIK1, THSD7B, RELN, PDE1C, NELL1, TRHDE, CACNA2D3, GRIN3A, NRXN3
## RSPO2, ROBO2, PLCB4, ADARB2, UTRN, PTPRT, EYA1, ENSSSCG00000023627, GRIK2, SLC24A3
## TMCC3, BRINP3, DOCK10, FSTL5, CCDC85A, SYTL5, ENSSSCG00000050221, SORCS1, RIN2, GRID2
## Negative: ENSSSCG00000004511, VAV3, TMEM132C, ZNF385D, IL1RAPL1, ZNF804A, PTCHD4, IQGAP2, BTBD11, KCNC2
## ENSSSCG00000038066, PTPRM, DOCK4, KIAA1217, SDK1, GRIA4, ERBB4, SLIT2, HMCN1, ENSSSCG00000043464
## ROBO1, ANK1, PLCXD3, HTR4, ENSSSCG00000051588, NXPH1, ETV6, GABRG3, CNTNAP5, SEMA3E
# get significant PCs
stdv <- neuron.sct.merged[["pca"]]@stdev
sum.stdv <- sum(neuron.sct.merged[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
cumulative <- cumsum(percent.stdv)
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co2 <- sort(which((percent.stdv[1:(length(percent.stdv) - 1)] -
percent.stdv[2:length(percent.stdv)]) > 0.1),
decreasing = TRUE)[1] + 1
min.pc <- min(co1, co2)
min.pc
## [1] 16
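As a quick visual check on this cutoff, Seurat's ElbowPlot can be drawn alongside the calculation (viewing 50 dims, the RunPCA default, is an assumption added here):
# visualize the standard deviation of each PC; the elbow should sit near min.pc
ElbowPlot(neuron.sct.merged, ndims = 50, reduction = "pca")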
# run UMAP
neuron.nointegration <- RunUMAP(neuron.sct.merged, dims = 1:min.pc, reduction = "pca")
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 15:41:13 UMAP embedding parameters a = 0.9922 b = 1.112
## 15:41:13 Read 3708 rows and found 16 numeric columns
## 15:41:13 Using Annoy for neighbor search, n_neighbors = 30
## 15:41:13 Building Annoy index with metric = cosine, n_trees = 50
## 15:41:14 Writing NN index file to temp file /tmp/RtmprlGalq/file4074e302e465c
## 15:41:14 Searching Annoy index using 1 thread, search_k = 3000
## 15:41:15 Annoy recall = 100%
## 15:41:16 Commencing smooth kNN distance calibration using 1 thread
## 15:41:18 Initializing from normalized Laplacian + noise
## 15:41:18 Commencing optimization for 500 epochs, with 157952 positive edges
## 15:41:30 Optimization finished
# cluster
neuron.nointegration <- FindNeighbors(object = neuron.nointegration, dims = 1:min.pc)
## Computing nearest neighbor graph
## Computing SNN
# Determine the clusters for various resolutions
neuron.nointegration <- FindClusters(object = neuron.nointegration, resolution = seq(0.1, 0.8, by = 0.1))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3708
## Number of edges: 123929
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9518
## Number of communities: 9
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3708
## Number of edges: 123929
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9277
## Number of communities: 10
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3708
## Number of edges: 123929
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9106
## Number of communities: 11
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3708
## Number of edges: 123929
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8951
## Number of communities: 12
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3708
## Number of edges: 123929
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8803
## Number of communities: 13
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3708
## Number of edges: 123929
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8667
## Number of communities: 14
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3708
## Number of edges: 123929
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8570
## Number of communities: 15
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3708
## Number of edges: 123929
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8475
## Number of communities: 16
## Elapsed time: 0 seconds
Idents(neuron.nointegration) <- "SCT_snn_res.0.1"
neuron.nointegration$seurat_clusters <- neuron.nointegration$SCT_snn_res.0.1
u1 <- dittoDimPlot(object = neuron.nointegration,
var = "seurat_clusters",
reduction.use = "umap",
do.label = TRUE,
labels.highlight = FALSE)
u1
u2 <- dittoDimPlot(object = neuron.nointegration,
var = "seurat_clusters",
reduction.use = "umap",
do.label = TRUE,
split.by = "treatment",
labels.highlight = FALSE)
u2
# show neuron cells by treatment
u3 <- DimPlot(object = neuron.nointegration,
reduction = "umap",
label = FALSE,
repel = TRUE,
group.by = "treatment",
cols = treatment_colors)
u3
# clean up plot objects
remove(u1,u2,u3)
# Cells per treatment per cluster
treatment_ncells <- FetchData(neuron.nointegration,
vars = c("ident", "treatment")) %>%
dplyr::count(ident,treatment) %>%
tidyr::spread(ident, n)
treatment_ncells
## treatment 0 1 2 3 4 5 6 7 8
## 1 LPS 1653 796 274 245 200 178 138 72 28
## 2 Saline 83 13 NA 4 1 5 NA 18 NA
# Cells per sample per cluster
sample_ncells <- FetchData(neuron.nointegration,
vars = c("ident", "sample")) %>%
dplyr::count(ident,sample) %>%
tidyr::spread(ident, n)
sample_ncells
## sample 0 1 2 3 4 5 6 7 8
## 1 12.LPS 1653 796 274 245 200 178 138 72 28
## 2 4.R.Saline 83 13 NA 4 1 5 NA 18 NA
# treatment
b1 <- neuron.nointegration@meta.data %>%
group_by(seurat_clusters, treatment) %>%
dplyr::count() %>%
group_by(seurat_clusters) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=treatment)) +
geom_col() +
scale_fill_manual(values = treatment_colors) +
ggtitle("Percentage of treatment per cluster")
b1
# sample
b2 <- neuron.nointegration@meta.data %>%
group_by(seurat_clusters, sample) %>%
dplyr::count() %>%
group_by(seurat_clusters) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=sample)) +
geom_col() +
#scale_fill_manual(values = sample_colors) +
ggtitle("Percentage of sample per cluster")
b2
remove(b1, b2)
# Run harmony to harmonize over samples
neuron.integrated <- RunHarmony(object = neuron.sct.merged,
group.by.vars = "sample",
assay.use = "SCT",
reduction = "pca",
plot_convergence = TRUE)
## Harmony 1/10
## Harmony 2/10
## Harmony converged after 2 iterations
## Warning: Invalid name supplied, making object name syntactically valid. New
## object name is Seurat..ProjectDim.SCT.harmony; see ?make.names for more details
## on syntax validity
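Before proceeding, it is worth confirming that the harmony reduction was added to the object; Reductions() is a standard Seurat accessor:
# list available dimensional reductions; "harmony" should now appear alongside "pca"
Reductions(neuron.integrated)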
First metric
# Determine percent of variation associated with each PC
stdv <- neuron.integrated[["pca"]]@stdev
sum.stdv <- sum(neuron.integrated[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
# Calculate cumulative percents for each PC
cumulative <- cumsum(percent.stdv)
# Determine the first PC where the cumulative percent exceeds 90% and
# the % variation associated with that PC is less than 5%
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co1
## [1] 42
Second metric
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which(
(percent.stdv[1:(length(percent.stdv) - 1)] -
percent.stdv[2:length(percent.stdv)]) > 0.1),
decreasing = TRUE)[1] + 1
# last point where change of % of variation is more than 0.1%.
co2
## [1] 16
Usually, we would choose the minimum of these two metrics as the number of PCs covering the majority of the variation in the data.
# Minimum of the two calculation
min.pc <- min(co1, co2)
min.pc
## [1] 16
Use the min.pc we just calculated to generate the clusters. We can plot the elbow plot again and overlay the information determined by our two metrics:
# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv,
cumu = cumulative,
rank = 1:length(percent.stdv))
# Elbow plot to visualize
ggplot(plot_df, aes(cumu, pct, label = rank, color = rank > min.pc)) +
geom_text() +
geom_vline(xintercept = 90, color = "grey") +
geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
theme_bw()
# Run UMAP on the harmony embeddings
neuron.integrated <- RunUMAP(neuron.integrated,
dims = 1:min.pc,
reduction = "harmony",
n.components = 3) # set to 3 to use with VR
## 15:41:44 UMAP embedding parameters a = 0.9922 b = 1.112
## 15:41:44 Read 3708 rows and found 16 numeric columns
## 15:41:44 Using Annoy for neighbor search, n_neighbors = 30
## 15:41:44 Building Annoy index with metric = cosine, n_trees = 50
## 15:41:44 Writing NN index file to temp file /tmp/RtmprlGalq/file4074e13bfb133
## 15:41:44 Searching Annoy index using 1 thread, search_k = 3000
## 15:41:45 Annoy recall = 100%
## 15:41:46 Commencing smooth kNN distance calibration using 1 thread
## 15:41:48 Initializing from normalized Laplacian + noise
## 15:41:48 Commencing optimization for 500 epochs, with 157952 positive edges
## 15:42:01 Optimization finished
# Determine the K-nearest neighbor graph on the harmony embeddings
neuron.integrated <- FindNeighbors(object = neuron.integrated, dims = 1:min.pc, reduction = "harmony")
## Computing nearest neighbor graph
## Computing SNN
# Determine the clusters for various resolutions
neuron.unannotated <- FindClusters(object = neuron.integrated, resolution = seq(0.1, 0.8, by = 0.1))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3708
## Number of edges: 123929
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9518
## Number of communities: 9
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3708
## Number of edges: 123929
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9277
## Number of communities: 10
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3708
## Number of edges: 123929
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9106
## Number of communities: 11
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3708
## Number of edges: 123929
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8951
## Number of communities: 12
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3708
## Number of edges: 123929
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8803
## Number of communities: 13
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3708
## Number of edges: 123929
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8667
## Number of communities: 14
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3708
## Number of edges: 123929
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8570
## Number of communities: 15
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3708
## Number of edges: 123929
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8475
## Number of communities: 16
## Elapsed time: 0 seconds
Idents(neuron.unannotated) <- "SCT_snn_res.0.1"
# save
saveRDS(neuron.unannotated,"../../rObjects/neuron_unannotated.rds")
DefaultAssay(neuron.unannotated) <- "RNA"
#neuron.unannotated <- NormalizeData(neuron.unannotated, verbose = FALSE)
neuron.unannotated$seurat_clusters <- neuron.unannotated$SCT_snn_res.0.1
u1 <- dittoDimPlot(object = neuron.unannotated,
var = "seurat_clusters",
reduction.use = "umap",
do.label = TRUE,
labels.highlight = FALSE)
u1
u2 <- dittoDimPlot(object = neuron.unannotated,
var = "seurat_clusters",
reduction.use = "umap",
do.label = TRUE,
split.by = "treatment",
labels.highlight = FALSE)
u2
# show neuron cells by treatment
u3 <- DimPlot(object = neuron.unannotated,
reduction = "umap",
label = FALSE,
repel = TRUE,
group.by = "treatment",
cols = treatment_colors)
u3
remove(u1,u2,u3)
cluster_colors <- c("gold","firebrick1","dodgerblue","green",
"cyan","chocolate4","gray40","purple", "blue")
neuron.unannotated <- BuildClusterTree(object = neuron.unannotated,
dims = 1:min.pc,
reorder = FALSE,
reorder.numeric = FALSE)
tree <- neuron.unannotated@tools$BuildClusterTree
tree$tip.label <- paste0("Cluster ", tree$tip.label)
p <- ggtree::ggtree(tree, aes(x, y)) +
scale_y_reverse() +
ggtree::geom_tree() +
ggtree::theme_tree() +
ggtree::geom_tiplab(offset = 1) +
ggtree::geom_tippoint(color = cluster_colors[1:length(tree$tip.label)], shape = 16, size = 5) +
coord_cartesian(clip = 'off') +
theme(plot.margin = unit(c(0,2.5,0,0), 'cm'))
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
p
# Cells per treatment per cluster
treatment_ncells <- FetchData(neuron.unannotated,
vars = c("ident", "treatment")) %>%
dplyr::count(ident,treatment) %>%
tidyr::spread(ident, n)
treatment_ncells
## treatment 0 1 2 3 4 5 6 7 8
## 1 LPS 1653 796 274 245 200 178 138 72 28
## 2 Saline 83 13 NA 4 1 5 NA 18 NA
# Cells per sample per cluster
sample_ncells <- FetchData(neuron.unannotated,
vars = c("ident", "sample")) %>%
dplyr::count(ident,sample) %>%
tidyr::spread(ident, n)
sample_ncells
## sample 0 1 2 3 4 5 6 7 8
## 1 12.LPS 1653 796 274 245 200 178 138 72 28
## 2 4.R.Saline 83 13 NA 4 1 5 NA 18 NA
# treatment
b1 <- neuron.unannotated@meta.data %>%
group_by(seurat_clusters, treatment) %>%
dplyr::count() %>%
group_by(seurat_clusters) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=treatment)) +
geom_col() +
scale_fill_manual(values = treatment_colors) +
ggtitle("Percentage of treatment per cluster")
b1
# sample
b2 <- neuron.unannotated@meta.data %>%
group_by(seurat_clusters, sample) %>%
dplyr::count() %>%
group_by(seurat_clusters) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=sample)) +
geom_col() +
#scale_fill_manual(values = sample_colors) +
ggtitle("Percentage of sample per cluster")
b2
remove(b1, b2)
# set levels
neuron.unannotated$treatment <- factor(neuron.unannotated$treatment,
levels = c("LPS","Saline"))
neuron.unannotated$SCT_snn_res.0.1 <- factor(neuron.unannotated$SCT_snn_res.0.1)
# initialize df
conserved.markers <- data.frame()
all.clusters <- unique(neuron.unannotated$SCT_snn_res.0.1)
# loop through each cluster
for (i in all.clusters) {
# print the cluster you're on
print(i)
# find conserved marker in cluster vs all other clusters
markers <- FindConservedMarkers(neuron.unannotated,
ident.1 = i, # subset to single cluster
grouping.var = "treatment", # compare by treatment
only.pos = TRUE, # only positive markers
min.pct = 0.1, # default
logfc.threshold = 0.25, # default
test.use = "MAST"
)
# skip if none
if(nrow(markers) == 0) {
next
}
# make rownames a column
markers <- rownames_to_column(markers, var = "gene")
# make cluster number a column
markers$cluster <- i
# add to final table
conserved.markers <- smartbind(conserved.markers, markers)
}
## [1] "1"
## Testing group Saline: (1) vs (5, 3, 7, 0, 4)
##
## Done!
## Combining coefficients and standard errors
## Calculating log-fold changes
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## Testing group LPS: (1) vs (3, 4, 2, 6, 0, 5, 7, 8)
##
## Done!
## Combining coefficients and standard errors
## Calculating log-fold changes
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## [1] "5"
## Testing group Saline: (5) vs (1, 3, 7, 0, 4)
##
## Done!
## Combining coefficients and standard errors
## Calculating log-fold changes
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## Testing group LPS: (5) vs (3, 4, 2, 1, 6, 0, 7, 8)
##
## Done!
## Combining coefficients and standard errors
## Calculating log-fold changes
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## [1] "3"
## Testing group Saline: (3) vs (1, 5, 7, 0, 4)
##
## Done!
## Combining coefficients and standard errors
## Calculating log-fold changes
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## Testing group LPS: (3) vs (4, 2, 1, 6, 0, 5, 7, 8)
##
## Done!
## Combining coefficients and standard errors
## Calculating log-fold changes
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## [1] "7"
## Testing group Saline: (7) vs (1, 5, 3, 0, 4)
##
## Done!
## Combining coefficients and standard errors
## Calculating log-fold changes
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## Testing group LPS: (7) vs (3, 4, 2, 1, 6, 0, 5, 8)
##
## Done!
## Combining coefficients and standard errors
## Calculating log-fold changes
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## [1] "0"
## Testing group Saline: (0) vs (1, 5, 3, 7, 4)
##
## Done!
## Combining coefficients and standard errors
## Calculating log-fold changes
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## Testing group LPS: (0) vs (3, 4, 2, 1, 6, 5, 7, 8)
##
## Done!
## Combining coefficients and standard errors
## Calculating log-fold changes
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## [1] "4"
## Warning: Saline has fewer than 3 cells in Identity: 4. Skipping Saline
## Testing group LPS: (4) vs (3, 2, 1, 6, 0, 5, 7, 8)
##
## Done!
## Combining coefficients and standard errors
## Calculating log-fold changes
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## Warning: Only a single group was tested
## [1] "2"
## Warning: Identity: 2 not present in group Saline. Skipping Saline
## Testing group LPS: (2) vs (3, 4, 1, 6, 0, 5, 7, 8)
##
## Done!
## Combining coefficients and standard errors
## Calculating log-fold changes
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## Warning: Only a single group was tested
## [1] "6"
## Warning: Identity: 6 not present in group Saline. Skipping Saline
## Testing group LPS: (6) vs (3, 4, 2, 1, 0, 5, 7, 8)
##
## Done!
## Combining coefficients and standard errors
## Calculating log-fold changes
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## Warning: Only a single group was tested
## [1] "8"
## Warning: Identity: 8 not present in group Saline. Skipping Saline
## Testing group LPS: (8) vs (3, 4, 2, 1, 6, 0, 5, 7)
##
## Done!
## Combining coefficients and standard errors
## Calculating log-fold changes
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## Warning: Only a single group was tested
# create delta_pct
conserved.markers$Saline_delta_pct <- abs(conserved.markers$Saline_pct.1 -
conserved.markers$Saline_pct.2)
conserved.markers$LPS_delta_pct <- abs(conserved.markers$LPS_pct.1 -
conserved.markers$LPS_pct.2)
# more stringent filtering: keep genes above the 3rd quartile of LPS_delta_pct
# (a Saline cutoff is not applied because several clusters have too few Saline cells)
markers.strict <- conserved.markers[
conserved.markers$LPS_delta_pct > summary(conserved.markers$LPS_delta_pct)[5],]
markers.strict$gene_name <- markers.strict$gene
markers.strict$row.num <- 1:nrow(markers.strict)
# compare
table(conserved.markers$cluster)
##
## 0 1 2 3 4 5 6 7 8
## 73 259 519 523 664 394 751 640 2351
table(markers.strict$cluster)
##
## 0 1 2 3 4 5 6 7 8
## 3 142 268 229 216 120 330 31 205
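To skim the strongest candidates per cluster before plotting, one could take the top genes by LPS_delta_pct; a dplyr sketch, not part of the original pipeline:
# top 5 strict markers per cluster, ranked by LPS_delta_pct
markers.strict %>%
dplyr::group_by(cluster) %>%
dplyr::slice_max(LPS_delta_pct, n = 5) %>%
dplyr::ungroup()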
saveRDS(conserved.markers, paste0("../../rObjects/brain_conserved_markers_", cell, ".rds"))
#conserved.markers <- readRDS(paste0("../../rObjects/brain_conserved_markers_", cell, ".rds"))
# subset
cluster0 <- markers.strict[markers.strict$cluster == 0,]
cluster1 <- markers.strict[markers.strict$cluster == 1,]
cluster2 <- markers.strict[markers.strict$cluster == 2,]
cluster3 <- markers.strict[markers.strict$cluster == 3,]
cluster4 <- markers.strict[markers.strict$cluster == 4,]
cluster5 <- markers.strict[markers.strict$cluster == 5,]
cluster6 <- markers.strict[markers.strict$cluster == 6,]
cluster7 <- markers.strict[markers.strict$cluster == 7,]
cluster8 <- markers.strict[markers.strict$cluster == 8,]
### Cluster 0
VlnPlot(neuron.unannotated,
features = cluster0$gene[1:2],
split.by = "seurat_clusters",
cols = cluster_colors,
flip = TRUE,
stack = TRUE)
## The default behaviour of split.by has changed.
## Separate violin plots are now plotted side-by-side.
## To restore the old behaviour of a single split violin,
## set split.plot = TRUE.
##
## This message will be shown once per session.
### Cluster 1
VlnPlot(neuron.unannotated,
features = cluster1$gene[1:20],
split.by = "seurat_clusters",
cols = cluster_colors,
flip = TRUE,
stack = TRUE)
VlnPlot(neuron.unannotated,
features = cluster1$gene[21:40],
split.by = "seurat_clusters",
cols = cluster_colors,
flip = TRUE,
stack = TRUE)
### Cluster 2
VlnPlot(neuron.unannotated,
features = cluster2$gene[1:20],
split.by = "seurat_clusters",
cols = cluster_colors,
flip = TRUE,
stack = TRUE)
VlnPlot(neuron.unannotated,
features = cluster2$gene[21:41],
split.by = "seurat_clusters",
cols = cluster_colors,
flip = TRUE,
stack = TRUE)
### Cluster 3
VlnPlot(neuron.unannotated,
features = cluster3$gene[1:20],
split.by = "seurat_clusters",
cols = cluster_colors,
flip = TRUE,
stack = TRUE)
VlnPlot(neuron.unannotated,
features = cluster3$gene[21:40],
split.by = "seurat_clusters",
cols = cluster_colors,
flip = TRUE,
stack = TRUE)
### Cluster 4
VlnPlot(neuron.unannotated,
features = cluster4$gene[1:20],
split.by = "seurat_clusters",
cols = cluster_colors,
flip = TRUE,
stack = TRUE)
VlnPlot(neuron.unannotated,
features = cluster4$gene[21:40],
split.by = "seurat_clusters",
cols = cluster_colors,
flip = TRUE,
stack = TRUE)
### Cluster 5
VlnPlot(neuron.unannotated,
features = cluster5$gene[1:20],
split.by = "seurat_clusters",
cols = cluster_colors,
flip = TRUE,
stack = TRUE)
VlnPlot(neuron.unannotated,
features = cluster5$gene[21:40],
split.by = "seurat_clusters",
cols = cluster_colors,
flip = TRUE,
stack = TRUE)
### Cluster 6
VlnPlot(neuron.unannotated,
features = cluster6$gene[1:20],
split.by = "seurat_clusters",
cols = cluster_colors,
flip = TRUE,
stack = TRUE)
VlnPlot(neuron.unannotated,
features = cluster6$gene[21:40],
split.by = "seurat_clusters",
cols = cluster_colors,
flip = TRUE,
stack = TRUE)
### Cluster 7
VlnPlot(neuron.unannotated,
features = cluster7$gene[1:20],
split.by = "seurat_clusters",
cols = cluster_colors,
flip = TRUE,
stack = TRUE)
VlnPlot(neuron.unannotated,
features = cluster7$gene[21:29],
split.by = "seurat_clusters",
cols = cluster_colors,
flip = TRUE,
stack = TRUE)
### Cluster 8
VlnPlot(neuron.unannotated,
features = cluster8$gene[1:20],
split.by = "seurat_clusters",
cols = cluster_colors,
flip = TRUE,
stack = TRUE)
VlnPlot(neuron.unannotated,
features = cluster8$gene[21:40],
split.by = "seurat_clusters",
cols = cluster_colors,
flip = TRUE,
stack = TRUE)
# Differential expression within each cluster: LPS vs Saline
cell_types <- unique(neuron.unannotated$SCT_snn_res.0.1)
neuron.unannotated$cell_type <- neuron.unannotated$SCT_snn_res.0.1
DE.df <- data.frame()
for (i in cell_types) {
print(i)
cluster <- subset(neuron.unannotated, cell_type == i)
Idents(cluster) <- cluster$treatment
markers <- FindMarkers(object = cluster,
ident.1 = "LPS",
ident.2 = "Saline",
only.pos = FALSE, # default
min.pct = 0.10, # default
test.use = "MAST",
verbose = TRUE,
assay = "RNA")
if(nrow(markers) == 0) {
next
}
markers$cluster <- i
DE.df <- smartbind(DE.df, markers)
}
# change column names
colnames(DE.df)[c(3,4)] <- c("percent_LPS","percent_Saline")
# add gene name column
DE.df$gene <- rownames(DE.df)
# change row names
rownames(DE.df) <- 1:nrow(DE.df)
# calculate percent difference
DE.df$percent_difference <- abs(DE.df$percent_LPS - DE.df$percent_Saline)
# reorder columns
DE.df <- DE.df[,c(6,7,1,5,2,3,4,8)]
# write table
write.table(DE.df, "../../results/recluster/neuron/LPS_vs_Saline_DEGs.tsv", sep = "\t",
quote = FALSE, row.names = FALSE)
# copy cell-level metadata into the assay meta.features slots; this appears to be a
# workaround for the SeuratDisk/ShinyCell conversion issues linked at the top of this file
metadata <- neuron.unannotated@meta.data
neuron.unannotated@meta.data <- metadata
neuron.unannotated@assays$SCT@meta.features <- metadata
neuron.unannotated@assays$RNA@meta.features <- metadata
# load libraries
library(ShinyCell)
# make shiny folder
DefaultAssay(neuron.unannotated) <- "RNA"
Idents(neuron.unannotated) <- "merged_clusters"
sc.config <- createConfig(neuron.unannotated)
makeShinyApp(neuron.unannotated, sc.config, gene.mapping = TRUE,
shiny.title = "recluster neuron - two good pigs")
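makeShinyApp writes the app to a "shinyApp/" folder by default; assuming that default location, the app can be previewed locally with:
# launch the generated app (assumes the default shiny.dir = "shinyApp/")
shiny::runApp("shinyApp/")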