We will then analyze an scATAC-seq data of adult mouse brain. You can download the data to be loaded into Seurat/Signac from given directory”C:/STAT646/Rcode/HW3_data”. Process the scATAC-seq, compute the gene activity matrix, and then perform cell-type label transfer from scRNA-seq (allen brain.rds) to scATAC-seq and provide visualization. Note that you will need to use the EnsDb.Mmusculus.v79 annotation package for the mouse mm10 genome.
First load in Signac, Seurat, and some other packages we will be using for analyzing human data.
if (!requireNamespace("EnsDb.Mmusculus.v79", quietly = TRUE))
BiocManager::install("EnsDb.Mmusculus.v79")
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("biovizBase")
## Bioconductor version 3.18 (BiocManager 1.30.22), R 4.3.2 (2023-10-31 ucrt)
## Warning: package(s) not installed when version(s) same as or greater than current; use
## `force = TRUE` to re-install: 'biovizBase'
## Installation paths not writeable, unable to update packages
## path: C:/Program Files/R/R-4.3.2/library
## packages:
## boot, cluster, codetools, foreign, lattice, MASS, Matrix, mgcv, nlme,
## rpart, survival
## Old packages: 'matrixStats', 'reticulate', 'uwot', 'zlibbioc'
remotes::install_github("stuart-lab/signac", ref="develop")
devtools::install_github('immunogenomics/presto')
library(Signac)
library(Seurat)
## Warning: package 'Seurat' was built under R version 4.3.3
library(EnsDb.Mmusculus.v79)
## Warning: package 'GenomeInfoDb' was built under R version 4.3.3
## Warning: package 'GenomicFeatures' was built under R version 4.3.3
library(ggplot2)
library(patchwork)
library(presto)
## Warning: package 'data.table' was built under R version 4.3.3
library(biovizBase)
counts <- Read10X_h5(filename = "C:/STAT646/Rcode/HW3_data/atac_v1_adult_brain_fresh_5k_filtered_peak_bc_matrix.h5")
metadata <- read.csv(
file = "C:/STAT646/Rcode/HW3_data/atac_v1_adult_brain_fresh_5k_singlecell.csv",
header = TRUE,
row.names = 1
)
chrom_assay <- CreateChromatinAssay(
counts = counts,
sep = c(":", "-"),
fragments = 'C:/STAT646/Rcode/HW3_data/atac_v1_adult_brain_fresh_5k_fragments.tsv.gz',
min.cells = 10,
min.features = 200
)
pbmc <- CreateSeuratObject(
counts = chrom_assay,
assay = "peaks",
meta.data = metadata
)
pbmc
## An object of class Seurat
## 154639 features across 4609 samples within 1 assay
## Active assay: peaks (154639 features, 0 variable features)
## 2 layers present: counts, data
fragpath <- 'C:/STAT646/Rcode/HW3_data/atac_v1_adult_brain_fresh_5k_fragments.tsv.gz'
# Define cells
# If you already have a list of cell barcodes to use you can skip this step
total_counts <- CountFragments(fragpath)
cutoff <- 1000 # Change this number depending on your dataset!
barcodes <- total_counts[total_counts$frequency_count > cutoff, ]$CB
# Create a fragment object
frags <- CreateFragmentObject(path = fragpath, cells = barcodes)
pbmc[['peaks']]
## ChromatinAssay data with 154639 features for 4609 cells
## Variable features: 0
## Genome:
## Annotation present: FALSE
## Motifs present: FALSE
## Fragment files: 1
granges(pbmc)
## GRanges object with 154639 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 3094708-3095552 *
## [2] chr1 3119414-3121782 *
## [3] chr1 3204809-3205178 *
## [4] chr1 3217330-3217359 *
## [5] chr1 3228123-3228463 *
## ... ... ... ...
## [154635] chrY 90761049-90762169 *
## [154636] chrY 90800417-90800831 *
## [154637] chrY 90804515-90805440 *
## [154638] chrY 90808545-90809111 *
## [154639] chrY 90810741-90810960 *
## -------
## seqinfo: 21 sequences from an unspecified genome; no seqlengths
# extract gene annotations from EnsDb
annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79)
# change to UCSC style since the data was mapped to mm10
seqlevels(annotations) <- paste0('chr', seqlevels(annotations))
genome(annotations) <- "mm10"
# add the gene information to the object
Annotation(pbmc) <- annotations
# compute nucleosome signal score per cell
pbmc <- NucleosomeSignal(object = pbmc)
# compute TSS enrichment score per cell
pbmc <- TSSEnrichment(object = pbmc, fast = FALSE)
# add blacklist ratio and fraction of reads in peaks
pbmc$pct_reads_in_peaks <- pbmc$peak_region_fragments / pbmc$passed_filters * 100
pbmc$blacklist_ratio <- pbmc$blacklist_region_fragments / pbmc$peak_region_fragments
DensityScatter(pbmc, x = 'nCount_peaks', y = 'TSS.enrichment', log_x = TRUE, quantiles = TRUE)
pbmc$high.tss <- ifelse(pbmc$TSS.enrichment > 3.4, 'High', 'Low')
TSSPlot(pbmc, group.by = 'high.tss') + NoLegend()
We can plot the distribution of each QC metric separately using a violin plot:
VlnPlot(
object = pbmc,
features = c('nCount_peaks', 'TSS.enrichment', 'blacklist_ratio', 'nucleosome_signal', 'pct_reads_in_peaks'),
pt.size = 0.1,
ncol = 5
)
Finally we remove cells that are outliers for these QC metrics.
pbmc <- subset(
x = pbmc,
subset = nCount_peaks > 1000 &
nCount_peaks < 900000 &
pct_reads_in_peaks > 25 &
blacklist_ratio < 0.05 &
nucleosome_signal < 3 &
TSS.enrichment > 3
)
pbmc
## An object of class Seurat
## 154639 features across 4078 samples within 1 assay
## Active assay: peaks (154639 features, 0 variable features)
## 2 layers present: counts, data
pbmc <- RunTFIDF(pbmc)
pbmc <- FindTopFeatures(pbmc, min.cutoff = 'q0')
pbmc <- RunSVD(pbmc)
The first LSI component often captures sequencing depth (technical
variation) rather than biological variation. If this is the case, the
component should be removed from downstream analysis. We can assess the
correlation between each LSI component and sequencing depth using the
DepthCor() function:
DepthCor(pbmc)
Here we see there is a very strong correlation between the first LSI component and the total number of counts for the cell. We will perform downstream steps without this component as we don’t want to group cells together based on their total sequencing depth, but rather by their patterns of accessibility at cell-type-specific peaks.
Now that the cells are embedded in a low-dimensional space we can use
methods commonly applied for the analysis of scRNA-seq data to perform
graph-based clustering and non-linear dimension reduction for
visualization. The functions RunUMAP(),
FindNeighbors(), and FindClusters() all come
from the Seurat package.
pbmc <- RunUMAP(object = pbmc, reduction = 'lsi', dims = 2:30)
pbmc <- FindNeighbors(object = pbmc, reduction = 'lsi', dims = 2:30)
pbmc <- FindClusters(object = pbmc, verbose = FALSE, algorithm = 3)
DimPlot(object = pbmc, label = TRUE) + NoLegend()
To create a gene activity matrix, we extract gene coordinates and
extend them to include the 2 kb upstream region (as promoter
accessibility is often correlated with gene expression). We then count
the number of fragments for each cell that map to each of these regions,
using the using the FeatureMatrix() function. These steps
are automatically performed by the GeneActivity()
function:
gene.activities <- GeneActivity(pbmc)
# add the gene activity matrix to the Seurat object as a new assay and normalize it
pbmc[['RNA']] <- CreateAssayObject(counts = gene.activities)
pbmc <- NormalizeData(
object = pbmc,
assay = 'RNA',
normalization.method = 'LogNormalize',
scale.factor = median(pbmc$nCount_RNA)
)
Now we can visualize the activities of canonical marker genes to help interpret our ATAC-seq clusters.
DefaultAssay(pbmc) <- 'RNA'
#Apoe, Ptgds, Apod, Itm2a, Ly6a, Ly6c1, Vtn, Plp1,
#Pltp, Myl9
FeaturePlot(
object = pbmc,
features = c('Gse1', 'Tcf4', 'Hivep3', 'Phactr1', 'Arid1b', 'Trio'),
pt.size = 0.1,
max.cutoff = 'q95',
ncol = 3
)
# Load the pre-processed scRNA-seq data for PBMCs
pbmc_rna <- readRDS("C:/STAT646/Rcode/HW3_data/allen_brain.rds")
pbmc_rna <- UpdateSeuratObject(pbmc_rna)
transfer.anchors <- FindTransferAnchors(
reference = pbmc_rna,
query = pbmc,
reduction = 'cca',
features = VariableFeatures(object = pbmc_rna),
reference.assay = 'RNA',
query.assay = 'RNA'
)
## Warning: 336 features of the features specified were not present in both the reference query assays.
## Continuing with remaining 1664 features.
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 8343 anchors
predicted.labels <- TransferData(
anchorset = transfer.anchors,
refdata = pbmc_rna$class,
weight.reduction = pbmc[['lsi']],
dims = 2:30
)
## Finding integration vectors
## Finding integration vector weights
## Predicting cell labels
pbmc <- AddMetaData(object = pbmc, metadata = predicted.labels)
plot1 <- DimPlot(
object = pbmc_rna,
group.by = 'class',
label = TRUE,
repel = TRUE) + NoLegend() + ggtitle('scRNA-seq')
plot2 <- DimPlot(
object = pbmc,
group.by = 'predicted.id',
label = TRUE,
repel = TRUE) + NoLegend() + ggtitle('scATAC-seq')
plot1 + plot2
You can see that the scRNA-based classifications are consistent with the UMAP visualization that was computed using the scATAC-seq data.
# replace each label with its most likely prediction
for(i in levels(pbmc)) {
cells_to_reid <- WhichCells(pbmc, idents = i)
newid <- names(which.max(table(pbmc$predicted.id[cells_to_reid])))
Idents(pbmc, cells = cells_to_reid) <- newid
}
# change back to working with peaks instead of gene activities
DefaultAssay(pbmc) <- 'peaks'
da_peaks <- FindMarkers(
object = pbmc,
ident.1 = "Glutamatergic",
ident.2 = "GABAergic",
test.use = 'wilcox',
latent.vars = 'nCount_peaks'
)
head(da_peaks)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## chr2-70560793-70564431 8.740931e-171 -3.795395 0.072 0.521 1.351689e-165
## chr16-18143809-18157128 5.535275e-143 3.105369 0.750 0.130 8.559695e-138
## chr13-36729367-36737870 1.738086e-133 3.528217 0.687 0.063 2.687759e-128
## chr2-158609624-158611209 7.025103e-132 -3.697267 0.042 0.393 1.086355e-126
## chr3-34661345-34665438 4.254231e-118 -4.013360 0.040 0.359 6.578700e-113
## chr3-34648818-34651400 8.995583e-118 -3.917035 0.047 0.374 1.391068e-112
plot1 <- VlnPlot(
object = pbmc,
features = rownames(da_peaks)[1],
pt.size = 0.1,
idents = c("Glutamatergic","GABAergic")
)
plot2 <- FeaturePlot(
object = pbmc,
features = rownames(da_peaks)[1],
pt.size = 0.1
)
plot1 | plot2
Another way to find DA regions between two groups of cells is to look
at the fold change accessibility between two groups of cells. This can
be much faster than running more sophisticated DA tests, but is not able
to account for latent variables such as differences in the total
sequencing depth between cells, and does not perform any statistical
test. However, this can still be a useful way to quickly explore data,
and can be performed using the FoldChange() function in
Seurat.
fc <- FoldChange(pbmc, ident.1 = "Glutamatergic", ident.2 = "GABAergic")
# order by fold change
fc <- fc[order(fc$avg_log2FC, decreasing = TRUE), ]
head(fc)
## avg_log2FC pct.1 pct.2
## chr5-25115651-25116707 11.22418 0.249 0
## chr13-37196804-37197682 11.18483 0.213 0
## chr12-44138694-44139616 11.10417 0.186 0
## chr19-33240054-33240948 11.09364 0.197 0
## chr7-113160899-113161978 11.05582 0.196 0
## chr8-46729324-46730341 11.04008 0.183 0
Peak coordinates can be difficult to interpret alone. We can find the
closest gene to each of these peaks using the
ClosestFeature() function.
open_Glutamatergic <- rownames(da_peaks[da_peaks$avg_log2FC > 3, ])
open_GABAergic <- rownames(da_peaks[da_peaks$avg_log2FC < -3, ])
closest_genes_Glutamatergic <- ClosestFeature(pbmc, regions = open_Glutamatergic)
closest_genes_GABAergic <- ClosestFeature(pbmc, regions = open_GABAergic)
head(closest_genes_Glutamatergic)
## tx_id gene_name gene_id
## ENSMUST00000059589 ENSMUST00000059589 Rtn4r ENSMUSG00000043811
## ENSMUST00000122286 ENSMUST00000122286 Nrn1 ENSMUSG00000039114
## ENSMUST00000028179 ENSMUST00000028179 Fcnb ENSMUSG00000026835
## ENSMUST00000173592 ENSMUST00000173592 Ptprd ENSMUSG00000028399
## ENSMUST00000053568 ENSMUST00000053568 Lingo1 ENSMUSG00000049556
## ENSMUSE00000519801 ENSMUST00000056558 Zfp366 ENSMUSG00000050919
## gene_biotype type closest_region
## ENSMUST00000059589 protein_coding cds chr16-18150732-18152131
## ENSMUST00000122286 protein_coding cds chr13-36730522-36730624
## ENSMUST00000028179 protein_coding utr chr2-28076378-28076574
## ENSMUST00000173592 protein_coding gap chr4-77632380-77925509
## ENSMUST00000053568 protein_coding gap chr9-56653833-56674422
## ENSMUSE00000519801 protein_coding exon chr13-99184823-99184864
## query_region distance
## ENSMUST00000059589 chr16-18143809-18157128 0
## ENSMUST00000122286 chr13-36729367-36737870 0
## ENSMUST00000028179 chr2-28066441-28069312 7065
## ENSMUST00000173592 chr4-77647141-77650769 0
## ENSMUST00000053568 chr9-56656237-56659192 0
## ENSMUSE00000519801 chr13-99146072-99148616 36206
head(closest_genes_GABAergic)
## tx_id gene_name gene_id
## ENSMUST00000148210 ENSMUST00000148210 Gad1 ENSMUSG00000070880
## ENSMUST00000045738 ENSMUST00000045738 Slc32a1 ENSMUSG00000037771
## ENSMUST00000099151 ENSMUST00000099151 Sox2 ENSMUSG00000074637
## ENSMUST00000099151.1 ENSMUST00000099151 Sox2 ENSMUSG00000074637
## ENSMUST00000032454 ENSMUST00000032454 Slc6a1 ENSMUSG00000030310
## ENSMUST00000180353 ENSMUST00000180353 Sox1 ENSMUSG00000096014
## gene_biotype type closest_region
## ENSMUST00000148210 protein_coding cds chr2-70563832-70563910
## ENSMUST00000045738 protein_coding utr chr2-158610767-158611241
## ENSMUST00000099151 protein_coding utr chr3-34651376-34652461
## ENSMUST00000099151.1 protein_coding cds chr3-34650416-34651375
## ENSMUST00000032454 protein_coding utr chr6-114282635-114282876
## ENSMUST00000180353 protein_coding cds chr8-12396361-12397536
## query_region distance
## ENSMUST00000148210 chr2-70560793-70564431 0
## ENSMUST00000045738 chr2-158609624-158611209 0
## ENSMUST00000099151 chr3-34661345-34665438 8883
## ENSMUST00000099151.1 chr3-34648818-34651400 0
## ENSMUST00000032454 chr6-114282226-114284458 0
## ENSMUST00000180353 chr8-12394731-12397324 0
We can plot the frequency of Tn5 integration across regions of the
genome for cells grouped by cluster, cell type, or any other metadata
stored in the object for any genomic region using the
CoveragePlot() function. These represent pseudo-bulk
accessibility tracks, where signal from all cells within a group have
been averaged together to visualize the DNA accessibility in a
region
# set plotting order
levels(pbmc) <- c("Glutamatergic","Non-Neuronal", "GABAergic","Endothelial" )
CoveragePlot(
object = pbmc,
region = rownames(da_peaks)[1],
extend.upstream = 40000,
extend.downstream = 20000
)