Analyzing PBMC scATAC-seq

We will then analyze an scATAC-seq data of adult mouse brain. You can download the data to be loaded into Seurat/Signac from given directory”C:/STAT646/Rcode/HW3_data”. Process the scATAC-seq, compute the gene activity matrix, and then perform cell-type label transfer from scRNA-seq (allen brain.rds) to scATAC-seq and provide visualization. Note that you will need to use the EnsDb.Mmusculus.v79 annotation package for the mouse mm10 genome.

First load in Signac, Seurat, and some other packages we will be using for analyzing human data.

if (!requireNamespace("EnsDb.Mmusculus.v79", quietly = TRUE))
    BiocManager::install("EnsDb.Mmusculus.v79")

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("biovizBase")

## Bioconductor version 3.18 (BiocManager 1.30.22), R 4.3.2 (2023-10-31 ucrt)

## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'biovizBase'

## Installation paths not writeable, unable to update packages
##   path: C:/Program Files/R/R-4.3.2/library
##   packages:
##     boot, cluster, codetools, foreign, lattice, MASS, Matrix, mgcv, nlme,
##     rpart, survival

## Old packages: 'matrixStats', 'reticulate', 'uwot', 'zlibbioc'

remotes::install_github("stuart-lab/signac", ref="develop")
devtools::install_github('immunogenomics/presto')
library(Signac)
library(Seurat)

## Warning: package 'Seurat' was built under R version 4.3.3

library(EnsDb.Mmusculus.v79)

## Warning: package 'GenomeInfoDb' was built under R version 4.3.3

## Warning: package 'GenomicFeatures' was built under R version 4.3.3

library(ggplot2)
library(patchwork)
library(presto)

## Warning: package 'data.table' was built under R version 4.3.3

library(biovizBase)

Pre-processing workflow

counts <- Read10X_h5(filename = "C:/STAT646/Rcode/HW3_data/atac_v1_adult_brain_fresh_5k_filtered_peak_bc_matrix.h5")
metadata <- read.csv(
  file = "C:/STAT646/Rcode/HW3_data/atac_v1_adult_brain_fresh_5k_singlecell.csv",
  header = TRUE,
  row.names = 1
)

chrom_assay <- CreateChromatinAssay(
  counts = counts,
  sep = c(":", "-"),
  fragments = 'C:/STAT646/Rcode/HW3_data/atac_v1_adult_brain_fresh_5k_fragments.tsv.gz',
  min.cells = 10,
  min.features = 200
)

pbmc <- CreateSeuratObject(
  counts = chrom_assay,
  assay = "peaks",
  meta.data = metadata
)

pbmc

## An object of class Seurat 
## 154639 features across 4609 samples within 1 assay 
## Active assay: peaks (154639 features, 0 variable features)
##  2 layers present: counts, data

fragpath <- 'C:/STAT646/Rcode/HW3_data/atac_v1_adult_brain_fresh_5k_fragments.tsv.gz'

# Define cells
# If you already have a list of cell barcodes to use you can skip this step
total_counts <- CountFragments(fragpath)
cutoff <- 1000 # Change this number depending on your dataset!
barcodes <- total_counts[total_counts$frequency_count > cutoff, ]$CB

# Create a fragment object
frags <- CreateFragmentObject(path = fragpath, cells = barcodes)

pbmc[['peaks']]

## ChromatinAssay data with 154639 features for 4609 cells
## Variable features: 0 
## Genome: 
## Annotation present: FALSE 
## Motifs present: FALSE 
## Fragment files: 1

granges(pbmc)

## GRanges object with 154639 ranges and 0 metadata columns:
##            seqnames            ranges strand
##               <Rle>         <IRanges>  <Rle>
##        [1]     chr1   3094708-3095552      *
##        [2]     chr1   3119414-3121782      *
##        [3]     chr1   3204809-3205178      *
##        [4]     chr1   3217330-3217359      *
##        [5]     chr1   3228123-3228463      *
##        ...      ...               ...    ...
##   [154635]     chrY 90761049-90762169      *
##   [154636]     chrY 90800417-90800831      *
##   [154637]     chrY 90804515-90805440      *
##   [154638]     chrY 90808545-90809111      *
##   [154639]     chrY 90810741-90810960      *
##   -------
##   seqinfo: 21 sequences from an unspecified genome; no seqlengths

# extract gene annotations from EnsDb
annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79)

# change to UCSC style since the data was mapped to mm10
seqlevels(annotations) <- paste0('chr', seqlevels(annotations))
genome(annotations) <- "mm10"

# add the gene information to the object
Annotation(pbmc) <- annotations

# compute nucleosome signal score per cell
pbmc <- NucleosomeSignal(object = pbmc)

# compute TSS enrichment score per cell
pbmc <- TSSEnrichment(object = pbmc, fast = FALSE)

# add blacklist ratio and fraction of reads in peaks
pbmc$pct_reads_in_peaks <- pbmc$peak_region_fragments / pbmc$passed_filters * 100
pbmc$blacklist_ratio <- pbmc$blacklist_region_fragments / pbmc$peak_region_fragments

DensityScatter(pbmc, x = 'nCount_peaks', y = 'TSS.enrichment', log_x = TRUE, quantiles = TRUE)

pbmc$high.tss <- ifelse(pbmc$TSS.enrichment > 3.4, 'High', 'Low')
TSSPlot(pbmc, group.by = 'high.tss') + NoLegend()

We can plot the distribution of each QC metric separately using a violin plot:

VlnPlot(
  object = pbmc,
  features = c('nCount_peaks', 'TSS.enrichment', 'blacklist_ratio', 'nucleosome_signal', 'pct_reads_in_peaks'),
  pt.size = 0.1,
  ncol = 5
)

Finally we remove cells that are outliers for these QC metrics.

pbmc <- subset(
  x = pbmc,
  subset = nCount_peaks > 1000 &
    nCount_peaks < 900000 &
    pct_reads_in_peaks > 25 &
    blacklist_ratio < 0.05 &
    nucleosome_signal < 3 &
    TSS.enrichment > 3
)
pbmc

## An object of class Seurat 
## 154639 features across 4078 samples within 1 assay 
## Active assay: peaks (154639 features, 0 variable features)
##  2 layers present: counts, data

Normalization and linear dimensional reduction

pbmc <- RunTFIDF(pbmc)
pbmc <- FindTopFeatures(pbmc, min.cutoff = 'q0')
pbmc <- RunSVD(pbmc)

The first LSI component often captures sequencing depth (technical variation) rather than biological variation. If this is the case, the component should be removed from downstream analysis. We can assess the correlation between each LSI component and sequencing depth using the DepthCor() function:

DepthCor(pbmc)

Here we see there is a very strong correlation between the first LSI component and the total number of counts for the cell. We will perform downstream steps without this component as we don’t want to group cells together based on their total sequencing depth, but rather by their patterns of accessibility at cell-type-specific peaks.

Non-linear dimension reduction and clustering

Now that the cells are embedded in a low-dimensional space we can use methods commonly applied for the analysis of scRNA-seq data to perform graph-based clustering and non-linear dimension reduction for visualization. The functions RunUMAP(), FindNeighbors(), and FindClusters() all come from the Seurat package.

pbmc <- RunUMAP(object = pbmc, reduction = 'lsi', dims = 2:30)
pbmc <- FindNeighbors(object = pbmc, reduction = 'lsi', dims = 2:30)
pbmc <- FindClusters(object = pbmc, verbose = FALSE, algorithm = 3)
DimPlot(object = pbmc, label = TRUE) + NoLegend()

Create a gene activity matrix

To create a gene activity matrix, we extract gene coordinates and extend them to include the 2 kb upstream region (as promoter accessibility is often correlated with gene expression). We then count the number of fragments for each cell that map to each of these regions, using the using the FeatureMatrix() function. These steps are automatically performed by the GeneActivity() function:

gene.activities <- GeneActivity(pbmc)

# add the gene activity matrix to the Seurat object as a new assay and normalize it
pbmc[['RNA']] <- CreateAssayObject(counts = gene.activities)
pbmc <- NormalizeData(
  object = pbmc,
  assay = 'RNA',
  normalization.method = 'LogNormalize',
  scale.factor = median(pbmc$nCount_RNA)
)

Now we can visualize the activities of canonical marker genes to help interpret our ATAC-seq clusters.

DefaultAssay(pbmc) <- 'RNA'

#Apoe, Ptgds, Apod, Itm2a, Ly6a, Ly6c1, Vtn, Plp1,
#Pltp, Myl9
FeaturePlot(
  object = pbmc,
  features = c('Gse1', 'Tcf4', 'Hivep3', 'Phactr1', 'Arid1b', 'Trio'),
  pt.size = 0.1,
  max.cutoff = 'q95',
  ncol = 3
)

Integrating with scRNA-seq data

# Load the pre-processed scRNA-seq data for PBMCs
pbmc_rna <- readRDS("C:/STAT646/Rcode/HW3_data/allen_brain.rds")
pbmc_rna <- UpdateSeuratObject(pbmc_rna)

transfer.anchors <- FindTransferAnchors(
  reference = pbmc_rna,
  query = pbmc,
  reduction = 'cca',
  features = VariableFeatures(object = pbmc_rna),
  reference.assay = 'RNA',
  query.assay = 'RNA'
)

## Warning: 336 features of the features specified were not present in both the reference query assays. 
## Continuing with remaining 1664 features.

## Running CCA

## Merging objects

## Finding neighborhoods

## Finding anchors

##  Found 8343 anchors

predicted.labels <- TransferData(
  anchorset = transfer.anchors,
  refdata = pbmc_rna$class,
  weight.reduction = pbmc[['lsi']],
  dims = 2:30
)

## Finding integration vectors

## Finding integration vector weights

## Predicting cell labels

pbmc <- AddMetaData(object = pbmc, metadata = predicted.labels)

plot1 <- DimPlot(
  object = pbmc_rna,
  group.by = 'class',
  label = TRUE,
  repel = TRUE) + NoLegend() + ggtitle('scRNA-seq')

plot2 <- DimPlot(
  object = pbmc,
  group.by = 'predicted.id',
  label = TRUE,
  repel = TRUE) + NoLegend() + ggtitle('scATAC-seq')

plot1 + plot2

You can see that the scRNA-based classifications are consistent with the UMAP visualization that was computed using the scATAC-seq data.

# replace each label with its most likely prediction
for(i in levels(pbmc)) {
  cells_to_reid <- WhichCells(pbmc, idents = i)
  newid <- names(which.max(table(pbmc$predicted.id[cells_to_reid])))
  Idents(pbmc, cells = cells_to_reid) <- newid
}

Find differentially accessible peaks between cell types

# change back to working with peaks instead of gene activities
DefaultAssay(pbmc) <- 'peaks'

da_peaks <- FindMarkers(
  object = pbmc,
  ident.1 = "Glutamatergic",
  ident.2 = "GABAergic",
  test.use = 'wilcox',
  latent.vars = 'nCount_peaks'
)
head(da_peaks)

##                                  p_val avg_log2FC pct.1 pct.2     p_val_adj
## chr2-70560793-70564431   8.740931e-171  -3.795395 0.072 0.521 1.351689e-165
## chr16-18143809-18157128  5.535275e-143   3.105369 0.750 0.130 8.559695e-138
## chr13-36729367-36737870  1.738086e-133   3.528217 0.687 0.063 2.687759e-128
## chr2-158609624-158611209 7.025103e-132  -3.697267 0.042 0.393 1.086355e-126
## chr3-34661345-34665438   4.254231e-118  -4.013360 0.040 0.359 6.578700e-113
## chr3-34648818-34651400   8.995583e-118  -3.917035 0.047 0.374 1.391068e-112

plot1 <- VlnPlot(
  object = pbmc,
  features = rownames(da_peaks)[1],
  pt.size = 0.1,
  idents = c("Glutamatergic","GABAergic")
)
plot2 <- FeaturePlot(
  object = pbmc,
  features = rownames(da_peaks)[1],
  pt.size = 0.1
)

plot1 | plot2

Another way to find DA regions between two groups of cells is to look at the fold change accessibility between two groups of cells. This can be much faster than running more sophisticated DA tests, but is not able to account for latent variables such as differences in the total sequencing depth between cells, and does not perform any statistical test. However, this can still be a useful way to quickly explore data, and can be performed using the FoldChange() function in Seurat.

fc <- FoldChange(pbmc, ident.1 = "Glutamatergic", ident.2 = "GABAergic")
# order by fold change
fc <- fc[order(fc$avg_log2FC, decreasing = TRUE), ]
head(fc)

##                          avg_log2FC pct.1 pct.2
## chr5-25115651-25116707     11.22418 0.249     0
## chr13-37196804-37197682    11.18483 0.213     0
## chr12-44138694-44139616    11.10417 0.186     0
## chr19-33240054-33240948    11.09364 0.197     0
## chr7-113160899-113161978   11.05582 0.196     0
## chr8-46729324-46730341     11.04008 0.183     0

Peak coordinates can be difficult to interpret alone. We can find the closest gene to each of these peaks using the ClosestFeature() function.

open_Glutamatergic <- rownames(da_peaks[da_peaks$avg_log2FC > 3, ])
open_GABAergic <- rownames(da_peaks[da_peaks$avg_log2FC < -3, ])

closest_genes_Glutamatergic <- ClosestFeature(pbmc, regions = open_Glutamatergic)
closest_genes_GABAergic <- ClosestFeature(pbmc, regions = open_GABAergic)

head(closest_genes_Glutamatergic)

##                                 tx_id gene_name            gene_id
## ENSMUST00000059589 ENSMUST00000059589     Rtn4r ENSMUSG00000043811
## ENSMUST00000122286 ENSMUST00000122286      Nrn1 ENSMUSG00000039114
## ENSMUST00000028179 ENSMUST00000028179      Fcnb ENSMUSG00000026835
## ENSMUST00000173592 ENSMUST00000173592     Ptprd ENSMUSG00000028399
## ENSMUST00000053568 ENSMUST00000053568    Lingo1 ENSMUSG00000049556
## ENSMUSE00000519801 ENSMUST00000056558    Zfp366 ENSMUSG00000050919
##                      gene_biotype type          closest_region
## ENSMUST00000059589 protein_coding  cds chr16-18150732-18152131
## ENSMUST00000122286 protein_coding  cds chr13-36730522-36730624
## ENSMUST00000028179 protein_coding  utr  chr2-28076378-28076574
## ENSMUST00000173592 protein_coding  gap  chr4-77632380-77925509
## ENSMUST00000053568 protein_coding  gap  chr9-56653833-56674422
## ENSMUSE00000519801 protein_coding exon chr13-99184823-99184864
##                               query_region distance
## ENSMUST00000059589 chr16-18143809-18157128        0
## ENSMUST00000122286 chr13-36729367-36737870        0
## ENSMUST00000028179  chr2-28066441-28069312     7065
## ENSMUST00000173592  chr4-77647141-77650769        0
## ENSMUST00000053568  chr9-56656237-56659192        0
## ENSMUSE00000519801 chr13-99146072-99148616    36206

head(closest_genes_GABAergic)

##                                   tx_id gene_name            gene_id
## ENSMUST00000148210   ENSMUST00000148210      Gad1 ENSMUSG00000070880
## ENSMUST00000045738   ENSMUST00000045738   Slc32a1 ENSMUSG00000037771
## ENSMUST00000099151   ENSMUST00000099151      Sox2 ENSMUSG00000074637
## ENSMUST00000099151.1 ENSMUST00000099151      Sox2 ENSMUSG00000074637
## ENSMUST00000032454   ENSMUST00000032454    Slc6a1 ENSMUSG00000030310
## ENSMUST00000180353   ENSMUST00000180353      Sox1 ENSMUSG00000096014
##                        gene_biotype type           closest_region
## ENSMUST00000148210   protein_coding  cds   chr2-70563832-70563910
## ENSMUST00000045738   protein_coding  utr chr2-158610767-158611241
## ENSMUST00000099151   protein_coding  utr   chr3-34651376-34652461
## ENSMUST00000099151.1 protein_coding  cds   chr3-34650416-34651375
## ENSMUST00000032454   protein_coding  utr chr6-114282635-114282876
## ENSMUST00000180353   protein_coding  cds   chr8-12396361-12397536
##                                  query_region distance
## ENSMUST00000148210     chr2-70560793-70564431        0
## ENSMUST00000045738   chr2-158609624-158611209        0
## ENSMUST00000099151     chr3-34661345-34665438     8883
## ENSMUST00000099151.1   chr3-34648818-34651400        0
## ENSMUST00000032454   chr6-114282226-114284458        0
## ENSMUST00000180353     chr8-12394731-12397324        0

Plotting genomic regions

We can plot the frequency of Tn5 integration across regions of the genome for cells grouped by cluster, cell type, or any other metadata stored in the object for any genomic region using the CoveragePlot() function. These represent pseudo-bulk accessibility tracks, where signal from all cells within a group have been averaged together to visualize the DNA accessibility in a region

# set plotting order
levels(pbmc) <- c("Glutamatergic","Non-Neuronal", "GABAergic","Endothelial" )

CoveragePlot(
  object = pbmc,
  region = rownames(da_peaks)[1],
  extend.upstream = 40000,
  extend.downstream = 20000
)