Advanced Chapter 14, Dealing with Large Data
Multi-Sample analysis Chapter 1
Super computer did not work last week! Here is why:
Normally R puts everything into memory for fast processing, but single cell RNAseq is very often too big to fit on the processor. The package solved it automatically for me by an out-of-memory representation (H5):
H5 objects solve memory limit problems by
H5 is usually easy and awesome! But, problems with H5 are:
Still, using H5 is recommended generally
The other option is to summarize data into many centroids via k-means or k-nearest neighbor.
The example to the right shows the sce richard data with an increasing number of k nearest neighbor centeroids. As k gets larger, most of the data can be summarized into proximal centeroids.
Some approaches (e.g. metacell) just combine the data from nearby cells. Other approaches, like BioCNeighbors, creates a centeroid index file to retrieve information for cell close to chosen centeroid(s).
viewof num_centroids = slider({
min: 2,
max: 100,
step: 1,
value: 3,
description: 'Number of centroids'
})viewof selection = {
const svg = d3.select(DOM.svg(width, height));
svg.append("g")
.call(xAxis);
svg.append("g")
.call(yAxis);
var show_colour=true;
var show_centroids=true;
var cluster_index=stepslider-5;
var centroids_index=stepslider-5;
var show_text=false;
var text="";
if(stepslider==1) {
cluster_index=0;
centroids_index=0;
show_colour=false;
show_centroids=false;
show_text=false;
text="To start with, we have some unlabelled data";
} else if (stepslider==2) {
cluster_index=0;
centroids_index=0;
show_colour=false;
show_centroids=true;
show_text=false;
text="We choose some initial centroids by picking random datapoints";
} else if (stepslider==3) {
cluster_index=0;
centroids_index=0;
show_colour=true;
show_centroids=true;
show_text=false;
text="All the datapoints are allocated to their nearest centroid";
} else if (stepslider==4) {
cluster_index=0;
centroids_index=1;
show_colour=true;
show_centroids=true;
show_text=false;
text="We then move the centroids to the mean of its allocated datapoints";
} else if (stepslider==5) {
cluster_index=0;
centroids_index=1;
show_colour=false;
show_centroids=true;
show_text=false;
text="We then remove the old allocation to make a new one";
} else if (stepslider==6) {
cluster_index=1;
centroids_index=1;
show_colour=true;
show_centroids=true;
show_text=false;
text="The steps of allocating datapoints and moving the centroids are repeated until convergence";
} else if (stepslider==clusters.centroids.length+4) {
show_text=false;
text="The centroids no longer move and the algorithm has converged";
}
const dot = svg.append("g")
.attr("fill", "none")
.attr("stroke-width", 1.5)
.selectAll("g")
.data(points)
.join("circle")
.attr("transform", d => `translate(${x(d.x)},${y(d.y)})`)
.attr("stroke", (d, i) => show_colour ? colours[clusters["clusters"][cluster_index][i]] : "gray" )
.attr("r", 3);
if(show_centroids) {
const cent = svg.append("g")
.attr("stroke-width", 1.5)
.selectAll("g")
.data(clusters["centroids"][centroids_index])
.join("circle")
.attr("transform", d => `translate(${x(d.x)},${y(d.y)})`)
.attr("fill", (d, i) => colours[i])
.attr("stroke", "black" )
.attr("r", 8);
}
if(show_text) {
const explaination = svg.append("g")
.append("text")
.attr("x", width/2)
.attr("y", height-16)
.attr("text-anchor", "middle")
.text(text);
}
return svg.node();
}viewof stepslider = slider({
min: 1,
max: clusters.centroids.length+4,
step: 1,
value: 11,
description: 'Step'
})closest = function(p, l) {
var best=0;
var best_dist=dist(p,l[0]);
for(var i=1; i<l.length; i++) {
if(best_dist>dist(p, l[i])) {
best=i;
best_dist=dist(p, l[i]);
}
}
return best;
}clusters = {
var clusters_object={};
clusters_object['centroids']=[];
clusters_object['clusters']=[];
var initial_centroids=[];
for(var i=0; i<num_centroids; i++) {
initial_centroids.push({'x': points[i]['x'], 'y': points[i]['y'] });
}
clusters_object['centroids'].push(initial_centroids);
var initial_clusters=[];
for(var i=0; i<points.length; i++) {
initial_clusters.push(closest(points[i], clusters_object['centroids'][0]));
}
clusters_object['clusters'].push(initial_clusters);
for(var i=1; i<22; i++) {
let new_centroids=centroid_means(clusters_object['clusters'][i-1]);
let converged=true;
for(var j=0; j<num_centroids; j++) {
if(isNaN(new_centroids[j].x)) {
new_centroids[j]=clusters_object['centroids'][i-1][j];
}
if(new_centroids[j].x != clusters_object['centroids'][i-1][j].x
|| new_centroids[j].y != clusters_object['centroids'][i-1][j].y) {
converged=false;
}
}
if(converged) {
break;
}
clusters_object['centroids'].push(new_centroids);
clusters_object['clusters'].push(points.map(function(x) {
return closest(x,clusters_object['centroids'][i])
}));
}
return clusters_object;
}centroid_means = function(l) {
var sums=[];
var counts=[];
var num_centroids=0;
for(var i=0; i<l.length; i++) {
if(l[i]>num_centroids-1) {
num_centroids=l[i]+1;
}
}
for(var i=0; i<num_centroids; i++) {
sums.push({'x': 0.0, 'y':0.0});
counts.push(0);
}
for(var i=0; i<points.length; i++) {
sums[l[i]]['x']+=points[i]['x'];
sums[l[i]]['y']+=points[i]['y'];
counts[l[i]]++;
}
var c=[];
for(var i=0; i<num_centroids; i++) {
c.push({'x': sums[i]['x']/counts[i], 'y': sums[i]['y']/counts[i]});
}
return c;
}xAxis = g => g
.attr("transform", `translate(0,${height - margin.bottom})`)
.call(d3.axisBottom(x))
.call(g => g.select(".domain").remove())
.call(g => g.append("text")
.attr("x", width - margin.right)
.attr("y", -4)
.attr("fill", "#000")
.attr("font-weight", "bold")
.attr("text-anchor", "end")
.text(points.x))yAxis = g => g
.attr("transform", `translate(${margin.left},0)`)
.call(d3.axisLeft(y))
.call(g => g.select(".domain").remove())
.call(g => g.select(".tick:last-of-type text").clone()
.attr("x", 4)
.attr("text-anchor", "start")
.attr("font-weight", "bold")
.text(points.y))randn_bm = function(random_generator) {
var u = 0, v = 0;
while(u === 0) u = random_generator.random(); //Converting [0,1) to (0,1)
while(v === 0) v = random_generator.random();
return Math.sqrt( -2.0 * Math.log( u ) ) * Math.cos( 2.0 * Math.PI * v );
}colours = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf","#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf","#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf","#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf","#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf","#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf","#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf","#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf","#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf","#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf","#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf","#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf","#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf","#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]There are two main package repositories for us, CRAN (general R stuff) and bioconducter (biology stuff)
From CRAN
From Bioconducter
(we also learned how to load objects and the package::function thing)
To load a r object, you can do it with a read_rds() command:
The SingleCellExperiment objcet class is similar to many, many bioinfomatic structures.
(also we learned graphing in R via ggplot2)
There are inherent chalanges to processing single cell RNAseq including sparsity and dead cell contamination.
A strict cut off can exclude real cells.
(we also learned pipes |>, dplyr filtering, and more ggplot2)
How we identify cell groups, but useful in most bioinfomatics. We discussed three ways:
(connects cells to closest k cells. A high K gives few clusters, small K gives many.)
Pro: works on big and small datasets
con: inflates clusters in high density areas.
(connects cells to closest centroid of k total)
Pro: quick!
con: does not give reliable results - use for over/under clustering
(make a tree, cut tree for clusters)
Pro: accurate and tunable
con: computationally expensive. use vector/k-means first
Different averages (mean, median, max, min) and statistics (AUC, Coehn’s FC) give different, but equally valid results. A p value is not valid.
Minimum rank with a log fold change threshold is the most stringent.
# r code
library(SingleCellExperiment)
library(scater)
library(tidyverse)
1sce <- read_rds("sce_dc.Rds")
markers<- scoreMarkers(sce,
sce$CellType ,
lfc=2)
2markers$DC6 |>
data.frame() |>
arrange(desc(min.AUC)) |>
round(2) |>
rownames_to_column(var="Symbol") |>
write_csv("DC6.csv)There are many ways each with their own limitations. We used a spearman’s correlation between two datasets
Fundamental issues with comparing two single cell RNAseq datasets:
Single-cell transcriptomics identifies prothymosin α restriction of HIV-1 in vivo
I went to the paper, Copied the GSE number
I searched the GSE233747 on the GEO website site and clicked the link
From each sample page, I scrolled down and downloaded each of the three files, re-named them, and put them in a folder
For Thursday worksheet, you will get the pipeline below, and multiple samples. Then, you will need to run the pipeline on all three samples and either cluster or test differential expression.
# Class 3 - Quality control .
df <- perCellQCMetrics(sce)
outliers <- (isOutlier(df$sum, type="lower", log=TRUE) | isOutlier(df$sum, type="higher", log=TRUE))
sce <- sce[, !outliers]
# Normalization and feature selection -skipped
sce <- logNormCounts(sce)
dec <- modelGeneVar(sce)
hvg <- getTopHVGs(dec, prop=0.1)
# PCA - skipped
set.seed(1234)
sce <- runPCA(sce, ncomponents=25, subset_row=hvg)
# class 4 - Clustering - class5
set.seed(100)
sce$clusters <- clusterCells(sce,
use.dimred="PCA",
BLUSPARAM=KmeansParam(centers=5))
rownames(sce) <- rowData(sce)$Symbol
labels_for_cell_id <- SingleR(test=sce, ref=ref, labels=ref$label.main)
sce$pred <- labels_for_cell_id$labels
# Visualization.
sce <- runTSNE(sce, dimred = 'PCA')
sce <- runUMAP(sce, dimred = 'PCA')Reminder: in R, it is very easy to make a new function. For Thursday, you will be putting in a singleCellExperiment (sce) into the pipeline and getting out a singleCellExperiment. Life will be better if you make a function out of your script.
Then you can run the script cleanly rather than copy and pasting everything
ScRNAseq assays usually generate data across multiple batches - either replicates, conditions, or getting online data. That creates uncontrollable problems: changes in wet lab processing, changes in reagent quality, etc.
Thus, there are usually systematic differences between replicates, refereed to as “batch effects.” Examples:
The following 3 PBMC scRNAseq were done by the same group, but at different times.
The gray T cells (and all other cell types) separate by batch and cell type
The following 2 bone marrow were done at the same time with same reagents
The missing cells in the knockout are naive B and T cells, and more T mem cells
How to batch correct is an evolving field. Usually, scRNA datasets are first scaled and then options include:
Maybe scale the data
Pro: you don’t make up results
Con: embedding will reflect reagents, not biology
For each gene, scaled down value until equal to lowest mean across batches.
pro: Good when you know populations should be the same
con: Assumes same cells are there in all the samples & only kind of works.
Run K nearest neighbor, but allow closest cells across batches. Move neighbors together.
Pro: it works, and is tunable by changing K (more K is more together)
Con: You made up your results
You can not use the scaled values for differential expression (DE). They are only for embedding
Differential expression (DE) analysis is built upon bulk RNAseq, which requires replicates.
It is popular to not do replicates in single cell RNAseq, which would break budgets. That does not change the fact that it is not appropriate to treat cells as replicates and then apply a p value:
A solution is to collapse clusters into pseudo replicates, and do traditional differential expression analysis across replicates.