primary_prostate_quality

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# module load bcl2fastq
# module load cellranger/3.0.2
#
# i="201113_NB500916_0796_AHW3YVBGXG_bhupinderP_10Xv3"; cellranger mkfastq --id=M_N --localcores=32 --localmem=95 --run=$i  --csv=sample_info.csv
# cellranger count --id=SI-GA-E9 --transcriptome=~/third_party_sofware/cell_ranger_references/refdata-cellranger-GRCh38-3.0.0/ --fastqs M_N/outs/fastq_path/HW3YVBGXG/PROSTATE --sample=PROSTATE --expect-cells=7000 --localmem=95

library(DropletUtils)

## Loading required package: SingleCellExperiment

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: GenomicRanges

## Loading required package: stats4

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:base':
## 
##     expand.grid

## Loading required package: IRanges

## Loading required package: GenomeInfoDb

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.6     ✓ dplyr   1.0.4
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::collapse()   masks IRanges::collapse()
## x dplyr::combine()    masks Biobase::combine(), BiocGenerics::combine()
## x dplyr::count()      masks matrixStats::count()
## x dplyr::desc()       masks IRanges::desc()
## x tidyr::expand()     masks S4Vectors::expand()
## x dplyr::filter()     masks stats::filter()
## x dplyr::first()      masks S4Vectors::first()
## x dplyr::lag()        masks stats::lag()
## x ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## x purrr::reduce()     masks GenomicRanges::reduce(), IRanges::reduce()
## x dplyr::rename()     masks S4Vectors::rename()
## x dplyr::slice()      masks IRanges::slice()

library(scran)
library(scater)
library(EnsDb.Hsapiens.v86)

## Loading required package: ensembldb

## Loading required package: GenomicFeatures

## Loading required package: AnnotationDbi

## 
## Attaching package: 'AnnotationDbi'

## The following object is masked from 'package:dplyr':
## 
##     select

## Loading required package: AnnotationFilter

## 
## Attaching package: 'ensembldb'

## The following object is masked from 'package:dplyr':
## 
##     filter

## The following object is masked from 'package:stats':
## 
##     filter

library(SingleR)
library(SingleCellExperiment)
library(tidySingleCellExperiment)

## 
## Attaching package: 'tidySingleCellExperiment'

## The following objects are masked from 'package:ensembldb':
## 
##     filter, select

## The following object is masked from 'package:AnnotationDbi':
## 
##     select

## The following objects are masked from 'package:dplyr':
## 
##     add_count, bind_cols, bind_rows, count

## The following object is masked from 'package:IRanges':
## 
##     slice

## The following object is masked from 'package:S4Vectors':
## 
##     rename

## The following object is masked from 'package:matrixStats':
## 
##     count

## The following object is masked from 'package:stats':
## 
##     filter

Import data

counts = 
  "data/SI-GA-E9/outs/raw_feature_bc_matrix/" %>%
  DropletUtils::read10xCounts(col.names = T) %>%
  tidy()

## Warning: `tidy()` was deprecated in tidySingleCellExperiment 1.1.1.
## tidySingleCellExperiment says: tidy() is not needed anymore.

rownames(counts) <- 
  uniquifyFeatureNames(
    rowData(counts)$ID,
    rowData(counts)$Symbol
  )

Calculate bar-codes ranks

barcode_ranks = 
  counts %>%
  counts() %>%
  DropletUtils::barcodeRanks() %>%
  {
    .x = (.)
    .x %>%
      tibble::as_tibble(rownames = "cell") %>%
      dplyr::mutate(
        knee =  metadata(.x)$knee,
        inflection =  metadata(.x)$inflection
      ) %>%
      arrange(rank)
  }
  

barcode_ranks%>%
  sample_frac(0.05) %>%
  ggplot2::ggplot(aes(rank, total)) + 
  geom_point(alpha=0.3) + 
  geom_line(aes(rank, fitted), color="red") + 
  geom_hline(aes(yintercept = knee), color="dodgerblue") +
  geom_hline(aes(yintercept = inflection), color="forestgreen") +
  scale_x_log10() + 
  scale_y_log10() + 
  theme_bw() + 
  theme(aspect.ratio=1)

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Removed 339649 row(s) containing missing values (geom_path).

Filter real cells

counts_filter = 
  counts %>%
  inner_join(
    barcode_ranks %>% filter(total > inflection)
  )

## Joining, by = "cell"

Annotate mitochondial

location <- mapIds(
  EnsDb.Hsapiens.v86, 
  keys=rowData(counts_filter)$ID, 
  column="SEQNAME",
  keytype="GENEID"
)

## Warning: Unable to map 312 of 33538 requested IDs.

counts_annotated = 
  counts_filter %>%
  
  # Join mitochondrion statistics
  left_join(
    perCellQCMetrics(., subsets=list(Mito=which(location=="MT"))) %>%
      as_tibble(rownames="cell"),
    by="cell"
  ) %>%
  
  # Label cells
  mutate(high_mitochondrion = isOutlier(subsets_Mito_percent, type="higher")) 

# Filter alive
counts_annotated %>%
  plotColData(
    y = "subsets_Mito_percent",
    colour_by = "high_mitochondrion"
  )

Filter alive

counts_alive = 
  counts_annotated %>%
  filter(!high_mitochondrion)

Scale counts

counts_scaled <- 
  counts_alive %>%
  computeSumFactors(cluster=quickCluster(counts_alive)) %>%
  logNormCounts()

UMAP

gene_variability <- modelGeneVarByPoisson(counts_scaled)
top_variable <- getTopHVGs(gene_variability, prop=0.1)

counts_reduction <- 
  counts_scaled %>%
  denoisePCA(subset.row=top_variable, technical=gene_variability) %>%
  runTSNE(dimred="PCA") %>%
  runUMAP(dimred="PCA")

Clustering

counts_cluster <-
  counts_reduction %>%
  mutate(
    cluster = 
      buildSNNGraph(., k=10, use.dimred = 'PCA') %>%
      igraph::cluster_louvain() %$%
      membership %>%
      as.factor()
  )

plotUMAP(counts_cluster, colour_by="cluster",text_by="cluster")

Cell type clasification

blueprint <- BlueprintEncodeData()

## Warning: 'BlueprintEncodeData' is deprecated.
## Use 'celldex::BlueprintEncodeData' instead.
## See help("Deprecated")

## snapshotDate(): 2020-10-27

## see ?celldex and browseVignettes('celldex') for documentation

## loading from cache

## see ?celldex and browseVignettes('celldex') for documentation

## loading from cache

cell_type_inference = SingleR(test=counts_cluster, ref=blueprint, labels=blueprint$label.main)

counts_cell_type = 
  counts_cluster %>%
  left_join(
    cell_type_inference %>% 
      as_tibble(rownames = "cell") %>%
      select(cell, cell_type = first.labels),
    by="cell"
  )

table(cell_type_inference$labels) %>% sort(decreasing = TRUE)

## 
##  Epithelial cells        Adipocytes   Skeletal muscle           B-cells 
##              1121               282               168                99 
##      CD4+ T-cells      Chondrocytes   Mesangial cells       Melanocytes 
##                90                56                51                48 
## Endothelial cells     Keratinocytes       Fibroblasts         Monocytes 
##                45                42                35                31 
##      CD8+ T-cells               HSC           Neurons        Astrocytes 
##                21                12                12                11 
##         Pericytes          NK cells     Smooth muscle                DC 
##                10                 9                 8                 5 
##          Myocytes       Macrophages       Eosinophils      Erythrocytes 
##                 5                 3                 2                 2 
##       Neutrophils 
##                 2

plotScoreHeatmap(cell_type_inference)

primary_prostate_quality_control