knitr::opts_knit$set(
root.dir = ".")
library(dplyr)
library(dittoSeq)
library(ggrepel)
library(ggtree)
library(parallel)
library(plotly) # 3D plot
library(Seurat) # Idents()
library(SeuratDisk) # SaveH5Seurat()
library(tibble) # rownnames_to_column
library(variancePartition)
options(mc.cores = detectCores() - 1)
# set levels and idents
pigs.unannotated$individual_clusters <-
factor(pigs.unannotated$SCT_snn_res.0.1,
levels = c("0","1","2","3","4","5","6","7","8","9","10","11", "12"))
pigs.unannotated$treatment <- factor(pigs.unannotated$treatment,
levels = c("LPS","Saline"))
Idents(pigs.unannotated) <- "individual_clusters"
# normalize if SCTtransform was used
DefaultAssay(pigs.unannotated) <- "RNA"
# natural-log
pigs.unannotated <- NormalizeData(pigs.unannotated, verbose = FALSE)
# create treatment column
batch <- gsub("4.R.Saline", "batch1", pigs.unannotated$sample)
batch <- gsub("10.Saline", "batch2", batch)
batch <- gsub("8.R.LPS", "batch1", batch)
batch <- gsub("12.LPS", "batch2", batch)
pigs.unannotated$batch <- factor(batch, levels = c("batch1","batch2"))
table(pigs.unannotated$batch)
##
## batch1 batch2
## 1762 7172
# Set colors
colors <- c("dodgerblue","yellow","firebrick1","green","gray40","lightgray",
"cyan","chocolate4","pink","orange","darkgreen","purple",
"deeppink","blue","gold","navyblue")
# Plot the UMAP
DimPlot(object = pigs.unannotated,
reduction = "umap",
label = TRUE,
#label.box = TRUE,
label.size = 3,
repel = TRUE,
group.by = "individual_clusters",
cols = colors
)
# Split by sample
DimPlot(object = pigs.unannotated,
reduction = "umap",
label = TRUE,
label.size = 3,
split.by = "sample",
repel = TRUE,
group.by = "individual_clusters",
cols = colors)
# Split by treatment
DimPlot(object = pigs.unannotated,
reduction = "umap",
label = TRUE,
label.size = 3,
split.by = "treatment",
repel = TRUE,
group.by = "individual_clusters",
cols = colors)
dittoDimPlot(object = pigs.unannotated,
var = "individual_clusters",
reduction.use = "umap",
do.label = TRUE,
labels.highlight = TRUE)
# umap percent.mt
FeaturePlot(pigs.unannotated,
reduction = "umap",
features = "CEMIP",
min.cutoff = 'q10',
label = TRUE)
# umap percent.mt split by sample
FeaturePlot(pigs.unannotated,
reduction = "umap",
features = "percent.mt",
split.by = "sample",
min.cutoff = 'q10',
label = TRUE)
dittoRidgePlot(object = pigs.unannotated,
var = "CXCL2",
group.by = "individual_clusters",
split.by = "treatment")
# treatment
pigs.unannotated@meta.data %>%
group_by(individual_clusters, treatment) %>%
dplyr::count() %>%
group_by(individual_clusters) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=individual_clusters,y=percent, fill=treatment)) +
geom_col() +
scale_fill_manual(values = c("purple","gray")) +
ggtitle("Percentage of treatment per cluster")
# sample
pigs.unannotated@meta.data %>%
group_by(individual_clusters, sample) %>%
dplyr::count() %>%
group_by(individual_clusters) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=individual_clusters,y=percent, fill=sample)) +
geom_col() +
#scale_fill_manual(values = sample_colors) +
ggtitle("Percentage of sample per cluster")
n_cells <- FetchData(pigs.unannotated,
vars = c("ident", "treatment")) %>%
dplyr::count(ident,treatment) %>%
tidyr::spread(ident, n)
n_cells
## treatment 0 1 2 3 4 5 6 7 8 9 10 11 12
## 1 LPS 2774 1070 383 375 341 373 443 138 194 118 147 72 1
## 2 Saline 487 403 368 336 252 204 38 174 24 78 10 31 100
Remove outliers with log-library size greater than 3 median absolute deviations (MADs) or below the median log-library size.
# MAD example
log.lib <- as.numeric(log10(pigs.unannotated$nCount_RNA))
med <- median(log.lib)
abs.dev <- abs(log.lib - med)
mad <- median(abs.dev)
mad <- mad * 1.4826 # multiply by consistency cutoff
mad
## [1] 0.4023281
# stats function
mad <- mad(log.lib)
mad
## [1] 0.4023281
# remove outliers greater than 3 MADs
remove <- abs(log.lib - median(log.lib)) / mad(log.lib) > 3
table(remove)
## remove
## FALSE
## 8934
pigs.unannotated <- BuildClusterTree(object = pigs.unannotated,
dims = 1:15,
reorder = FALSE,
reorder.numeric = FALSE)
tree <- pigs.unannotated@tools$BuildClusterTree
tree$tip.label <- paste0("Cluster ", tree$tip.label)
tree_1 <- ggtree::ggtree(tree, aes(x, y)) +
scale_y_reverse() +
ggtree::geom_tree() +
ggtree::theme_tree() +
ggtree::geom_tiplab(offset = 1) +
ggtree::geom_tippoint(color = colors[1:length(tree$tip.label)],
shape = 16, size = 5) +
coord_cartesian(clip = 'off') +
theme(plot.margin = unit(c(0,2.5,0,0), 'cm'))
info <- pigs.unannotated@meta.data
expression <- GetAssayData(object = pigs.unannotated, slot = "counts")
form <- ~ (1|treatment) + (1|sample) + (1|batch) + percent.mt
varPart <- fitExtractVarPartModel(expression, form, info)
## Dividing work into 100 chunks...
##
## Total:6654 s
vp <- sortCols(varPart)
vp_violin <- plotVarPart(vp)
vp_violin
vp_bar <- plotPercentBars(vp[1:10,])
vp_bar
## Top variable genes per variable
varPart.df <- as.data.frame(vp_df)
# sort genes based on variance explained by treatment
vp <- vp[order(vp$treatment, decreasing = TRUE),]
head(vp["treatment"], 10)
# sort genes based on variance explained by sample
vp <- vp[order(vp$sample, decreasing = TRUE),]
head(vp["sample"], 10)
# sort genes based on variance explained by percent mito
vp <- vp[order(vp$percent.mt, decreasing = TRUE),]
head(vp["percent.mt"], 10)
# sort genes based on variance explained by batch
vp <- vp[order(vp$batch, decreasing = TRUE),]
head(vp["batch"], 10)
# Find markers for each cluster
# Default is logfc.threshold = 0.25 and min.pct = 0.1
all.markers <- FindAllMarkers(object = pigs.unannotated,
test.use = "MAST")
# add column
all.markers$delta_pct <- abs(all.markers$pct.1 - all.markers$pct.2)
# rename columns and rows
rownames(all.markers) <- 1:nrow(all.markers)
all.markers <- all.markers[,c(6,7,1,5,2:4,8)]
colnames(all.markers)[c(6,7)] <- c("pct_1","pct_2")
# save
saveRDS(all.markers, "../../rObjects/unannotated_all_markers.rds")
# read object
all.markers <- readRDS("../../rObjects/unannotated_all_markers.rds")
# more stringent filtering
all.markers <- all.markers[all.markers$p_val_adj < 0.01,]
markers.strict <- all.markers[
all.markers$delta_pct > summary(all.markers$delta_pct)[5],]
# compare
table(all.markers$cluster)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## 3248 3118 3443 3400 4647 1997 2494 3479 2453 1583 1976 384 2529
table(markers.strict$cluster)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## 452 748 692 738 945 321 638 797 888 190 684 172 1383
# subset
cluster0 <- markers.strict[markers.strict$cluster == 0,]
cluster1 <- markers.strict[markers.strict$cluster == 1,]
cluster2 <- markers.strict[markers.strict$cluster == 2,]
cluster3 <- markers.strict[markers.strict$cluster == 3,]
cluster4 <- markers.strict[markers.strict$cluster == 4,]
cluster5 <- markers.strict[markers.strict$cluster == 5,]
cluster6 <- markers.strict[markers.strict$cluster == 6,]
cluster7 <- markers.strict[markers.strict$cluster == 7,]
cluster8 <- markers.strict[markers.strict$cluster == 8,]
cluster9 <- markers.strict[markers.strict$cluster == 9,]
cluster10 <- markers.strict[markers.strict$cluster == 10,]
cluster11 <- markers.strict[markers.strict$cluster == 11,]
cluster12 <- markers.strict[markers.strict$cluster == 12,]
FindConservedMarkers() will first separate cells by treatment and then perform differential gene expression testing for a single specified cluster against all other clusters.
There are three important arguments which provide thresholds for determining if a gene is a marker.
** logfc.threshold ** min.diff.pct ** min.pct
Output:
** gene: gene symbol ** condition_p_val: p-value not adjusted for multiple test correction for condition ** condition_avg_logFC: average log fold change for condition. Positive values indicate that the gene is more highly expressed in the cluster. ** condition_pct.1: percentage of cells where the gene is detected in the cluster for condition ** condition_pct.2: percentage of cells where the gene is detected on average in the other clusters for condition ** condition_p_val_adj: adjusted p-value for condition, based on bonferroni correction using all genes in the dataset, used to determine significance ** max_pval: largest p value of p value calculated by each group/condition ** minimump_p_val: combined p value
When looking at the output, we suggest looking for markers with large differences in expression between pct.1 and pct.2 and larger fold changes. For instance if pct.1 = 0.90 and pct.2 = 0.80, it may not be as exciting of a marker. However, if pct.2 = 0.1 instead, the bigger difference would be more convincing.
Also, of interest is if the majority of cells expressing the marker is in your cluster of interest. If pct.1 is low, such as 0.3, it may not be as interesting. Both of these are also possible parameters to include when running the function.
# initialize df
conserved.markers <- data.frame()
all.clusters <- levels(pigs.unannotated$SCT_snn_res.0.1)
# loop through each cluster
for (i in all.clusters) {
# print the cluster you're on
print(i)
# find conserved marker in cluster vs all other clusters
markers <- FindConservedMarkers(pigs.unannotated,
ident.1 = i, # subset to single cluster
grouping.var = "treatment", # compare by treatment
only.pos = TRUE, # default
min.pct = 0.1, # default
logfc.threshold = 0.25, # default
test.use = "MAST"
)
# skip if none
if(nrow(markers) == 0) {
next
}
# make rownames a column
markers <- rownames_to_column(markers, var = "gene")
# make cluster number a column
markers$cluster <- i
# add to final table
conserved.markers <- rbind(conserved.markers, markers)
}
# create delta_pct
conserved.markers$LPS_delta_pct <- abs(conserved.markers$LPS_pct.1 -
conserved.markers$LPS_pct.2)
conserved.markers$Saline_delta_pct <- abs(conserved.markers$Saline_pct.1 -
conserved.markers$Saline_pct.2)
# more stringent filtering
markers.strict <- conserved.markers[
conserved.markers$LPS_delta_pct > summary(conserved.markers$LPS_delta_pct)[5],]
markers.strict <- conserved.markers[
conserved.markers$Saline_delta_pct > summary(conserved.markers$Saline_delta_pct)[5],]
markers.strict$gene_name <- markers.strict$gene
markers.strict$row.num <- 1:nrow(markers.strict)
# compare
table(as.numeric(conserved.markers$cluster))
table(as.numeric(markers.strict$cluster))
# save (takes a while to run)
saveRDS(conserved.markers, "../../rObjects/brain_conserved_markers.rds")
saveRDS(markers.strict, "../../rObjects/brain_conserved_markers_strict.rds")
conserved.markers <- readRDS("../../rObjects/brain_conserved_markers.rds")
markers.strict <- readRDS("../../rObjects/brain_conserved_markers.rds")
# Printing out the most variable genes driving PCs
print(x = pigs.unannotated[["pca"]],
dims = 1:10,
nfeatures = 10)
## PC_ 1
## Positive: SLC1A2, PREX2, NOL11, NHSL1, SLC1A3, EYA2, IRAG1, POLR3B, LRIG1, ZBTB20
## Negative: KCNIP4, RBFOX1, DPP10, ROBO2, OPCML, NRXN3, TENM2, LRRTM4, GALNTL6, HS6ST3
## PC_ 2
## Positive: SLC1A2, PREX2, NOL11, NHSL1, LSAMP, CADM2, IRAG1, POLR3B, CTNND2, NTM
## Negative: ENSSSCG00000017146, MX2, ENSSSCG00000033089, GAB2, RBMS3, HERC6, EPSTI1, BNC2, ARHGAP24, LRMDA
## PC_ 3
## Positive: CEMIP, BNC2, RBMS3, ADAM12, SVIL, EYA1, COLEC12, GPC6, USP53, BMP6
## Negative: RNF220, C10orf90, ST18, MBP, OPALIN, PLP1, PLEKHH1, ENSSSCG00000049542, DOCK10, ZNF536
## PC_ 4
## Positive: CALCR, INPP5D, ARHGAP24, ARHGAP15, DOCK8, GAB2, PDE3B, LCP2, RUNX1, VAV1
## Negative: RBMS3, CEMIP, BNC2, ADAM12, EYA1, COLEC12, FBXL7, BMP6, SVIL, BICC1
## PC_ 5
## Positive: KCNIP4, NKAIN2, TAFA1, ENSSSCG00000011729, RNF220, C10orf90, MBP, SLIT3, PLEKHH1, OPALIN
## Negative: LHFPL3, NXPH1, SOX6, TNR, MMP16, NELL1, ITGA9, KCNIP1, AGAP1, THSD7B
## PC_ 6
## Positive: LHFPL3, DSCAM, TNR, XYLT1, AGAP1, MMP16, KCNB2, ITGA9, STK32A, KCNIP4
## Negative: ERBB4, NXPH1, GALNTL6, PTCHD4, KIAA1217, ZNF536, BTBD11, ZNF804B, KCNC2, GAD2
## PC_ 7
## Positive: SLC16A9, GPC6, FOXP1, NTN1, SLC26A2, SLC47A1, KIAA1755, BNC2, ZFHX3, ADAMTS9
## Negative: CEMIP, MX2, ENSSSCG00000033089, EPSTI1, RSAD2, ENSSSCG00000017146, COLEC12, HERC6, ARHGAP28, PARP14
## PC_ 8
## Positive: ENSSSCG00000033089, MX2, RSAD2, ENSSSCG00000017146, RBMS3, ROR2, PARP14, C3, PLEKHG1, ADAMTS9
## Negative: TRPM3, GPC5, CEMIP, ATP1A4, ENSSSCG00000044580, SLC6A20, KCNK2, ADAM12, PRKAG2, MAPK4
## PC_ 9
## Positive: CA10, TAFA1, ENC1, RFX3, GRIA4, RASGRF2, CCK, SLIT2, FSTL5, ENSSSCG00000014097
## Negative: HS3ST4, DPP10, GRIK3, IL1RAPL2, KIAA1217, SEMA3E, ZFPM2, CLSTN2, SLC35F4, RASGRP1
## PC_ 10
## Positive: NHSL1, EYA2, SLC1A2, EYA1, EFEMP1, PREX2, GPC5, RBFOX1, ENSSSCG00000051274, ROBO2
## Negative: HS3ST4, DPP10, ESRRG, GRM8, WWC1, GALNT9, GPM6A, SEMA3E, KIRREL3, SEZ6L
# print top variable genes
top100 <- pigs.unannotated@assays$SCT@var.features[1:100]
top100
## [1] "SLC1A2" "TNR" "SLC16A9"
## [4] "LHFPL3" "NOL11" "ENSSSCG00000049542"
## [7] "CALCR" "MBP" "RNF220"
## [10] "ENSSSCG00000044602" "XYLT1" "ST18"
## [13] "ARHGAP24" "CEMIP" "GALNTL6"
## [16] "C10orf90" "RBFOX1" "KCNIP4"
## [19] "ARHGAP15" "ANK1" "DPP10"
## [22] "OPALIN" "NRG1" "SLC26A2"
## [25] "PREX2" "NELL1" "LUZP2"
## [28] "C3" "ROBO2" "IDO2"
## [31] "NXPH1" "NRXN3" "NTM"
## [34] "IL1RAPL2" "OPCML" "TRPM3"
## [37] "BNC2" "CLSTN2" "COLEC12"
## [40] "INPP5D" "TCF7L2" "BMP6"
## [43] "LRMDA" "RUNX1" "MX2"
## [46] "MMP16" "EBF1" "KIRREL3"
## [49] "FLI1" "PDE3B" "LSAMP"
## [52] "GLI3" "NHSL1" "ZNF536"
## [55] "DLGAP2" "ABLIM1" "PLAT"
## [58] "BCAS1" "DOCK8" "SLC6A20"
## [61] "GPC5" "AGMO" "PLEKHH1"
## [64] "DOCK10" "ENSSSCG00000045193" "ADAM12"
## [67] "ENSSSCG00000051274" "ENSSSCG00000004830" "RBPMS"
## [70] "ENSSSCG00000017146" "ITGA9" "GPR39"
## [73] "ENSSSCG00000051497" "TENM2" "HS6ST3"
## [76] "RN18S" "CREB5" "ASIC2"
## [79] "ARHGAP28" "DOCK2" "SVIL"
## [82] "TMCC3" "SLC1A3" "PRR5L"
## [85] "FOXP1" "THSD7B" "USP53"
## [88] "ENSSSCG00000024973" "TAFA1" "TNC"
## [91] "FHOD3" "EYA2" "KIAA1217"
## [94] "KCTD16" "ABR" "ARHGAP10"
## [97] "NFASC" "FRMD5" "HMCN2"
## [100] "TNIK"