This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# module load bcl2fastq
# module load cellranger/3.0.2
#
# i="201113_NB500916_0796_AHW3YVBGXG_bhupinderP_10Xv3"; cellranger mkfastq --id=M_N --localcores=32 --localmem=95 --run=$i --csv=sample_info.csv
# cellranger count --id=SI-GA-E9 --transcriptome=~/third_party_sofware/cell_ranger_references/refdata-cellranger-GRCh38-3.0.0/ --fastqs M_N/outs/fastq_path/HW3YVBGXG/PROSTATE --sample=PROSTATE --expect-cells=7000 --localmem=95
library(DropletUtils)
## Loading required package: SingleCellExperiment
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.6 ✓ dplyr 1.0.4
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::collapse() masks IRanges::collapse()
## x dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## x dplyr::count() masks matrixStats::count()
## x dplyr::desc() masks IRanges::desc()
## x tidyr::expand() masks S4Vectors::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks S4Vectors::first()
## x dplyr::lag() masks stats::lag()
## x ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## x purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## x dplyr::rename() masks S4Vectors::rename()
## x dplyr::slice() masks IRanges::slice()
library(scran)
library(scater)
library(EnsDb.Hsapiens.v86)
## Loading required package: ensembldb
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: AnnotationFilter
##
## Attaching package: 'ensembldb'
## The following object is masked from 'package:dplyr':
##
## filter
## The following object is masked from 'package:stats':
##
## filter
library(SingleR)
library(SingleCellExperiment)
library(tidySingleCellExperiment)
##
## Attaching package: 'tidySingleCellExperiment'
## The following objects are masked from 'package:ensembldb':
##
## filter, select
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following objects are masked from 'package:dplyr':
##
## add_count, bind_cols, bind_rows, count
## The following object is masked from 'package:IRanges':
##
## slice
## The following object is masked from 'package:S4Vectors':
##
## rename
## The following object is masked from 'package:matrixStats':
##
## count
## The following object is masked from 'package:stats':
##
## filter
counts =
"data/SI-GA-E9/outs/raw_feature_bc_matrix/" %>%
DropletUtils::read10xCounts(col.names = T) %>%
tidy()
## Warning: `tidy()` was deprecated in tidySingleCellExperiment 1.1.1.
## tidySingleCellExperiment says: tidy() is not needed anymore.
rownames(counts) <-
uniquifyFeatureNames(
rowData(counts)$ID,
rowData(counts)$Symbol
)
barcode_ranks =
counts %>%
counts() %>%
DropletUtils::barcodeRanks() %>%
{
.x = (.)
.x %>%
tibble::as_tibble(rownames = "cell") %>%
dplyr::mutate(
knee = metadata(.x)$knee,
inflection = metadata(.x)$inflection
) %>%
arrange(rank)
}
barcode_ranks%>%
sample_frac(0.05) %>%
ggplot2::ggplot(aes(rank, total)) +
geom_point(alpha=0.3) +
geom_line(aes(rank, fitted), color="red") +
geom_hline(aes(yintercept = knee), color="dodgerblue") +
geom_hline(aes(yintercept = inflection), color="forestgreen") +
scale_x_log10() +
scale_y_log10() +
theme_bw() +
theme(aspect.ratio=1)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 339649 row(s) containing missing values (geom_path).
counts_filter =
counts %>%
inner_join(
barcode_ranks %>% filter(total > inflection)
)
## Joining, by = "cell"
location <- mapIds(
EnsDb.Hsapiens.v86,
keys=rowData(counts_filter)$ID,
column="SEQNAME",
keytype="GENEID"
)
## Warning: Unable to map 312 of 33538 requested IDs.
counts_annotated =
counts_filter %>%
# Join mitochondrion statistics
left_join(
perCellQCMetrics(., subsets=list(Mito=which(location=="MT"))) %>%
as_tibble(rownames="cell"),
by="cell"
) %>%
# Label cells
mutate(high_mitochondrion = isOutlier(subsets_Mito_percent, type="higher"))
# Filter alive
counts_annotated %>%
plotColData(
y = "subsets_Mito_percent",
colour_by = "high_mitochondrion"
)
counts_alive =
counts_annotated %>%
filter(!high_mitochondrion)
counts_scaled <-
counts_alive %>%
computeSumFactors(cluster=quickCluster(counts_alive)) %>%
logNormCounts()
gene_variability <- modelGeneVarByPoisson(counts_scaled)
top_variable <- getTopHVGs(gene_variability, prop=0.1)
counts_reduction <-
counts_scaled %>%
denoisePCA(subset.row=top_variable, technical=gene_variability) %>%
runTSNE(dimred="PCA") %>%
runUMAP(dimred="PCA")
counts_cluster <-
counts_reduction %>%
mutate(
cluster =
buildSNNGraph(., k=10, use.dimred = 'PCA') %>%
igraph::cluster_louvain() %$%
membership %>%
as.factor()
)
plotUMAP(counts_cluster, colour_by="cluster",text_by="cluster")
blueprint <- BlueprintEncodeData()
## Warning: 'BlueprintEncodeData' is deprecated.
## Use 'celldex::BlueprintEncodeData' instead.
## See help("Deprecated")
## snapshotDate(): 2020-10-27
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
cell_type_inference = SingleR(test=counts_cluster, ref=blueprint, labels=blueprint$label.main)
counts_cell_type =
counts_cluster %>%
left_join(
cell_type_inference %>%
as_tibble(rownames = "cell") %>%
select(cell, cell_type = first.labels),
by="cell"
)
table(cell_type_inference$labels) %>% sort(decreasing = TRUE)
##
## Epithelial cells Adipocytes Skeletal muscle B-cells
## 1121 282 168 99
## CD4+ T-cells Chondrocytes Mesangial cells Melanocytes
## 90 56 51 48
## Endothelial cells Keratinocytes Fibroblasts Monocytes
## 45 42 35 31
## CD8+ T-cells HSC Neurons Astrocytes
## 21 12 12 11
## Pericytes NK cells Smooth muscle DC
## 10 9 8 5
## Myocytes Macrophages Eosinophils Erythrocytes
## 5 3 2 2
## Neutrophils
## 2
plotScoreHeatmap(cell_type_inference)