Background

Biology

The biological question is whether it is possible to define differences between Tmem and Teffector cells at the single cell level. The system uses MRSV infection of mice followed by isolation at the time of infection to get T-effector cells and after convalesence to get T-memory cells. The cells can be isolated because a particular cell membrane protein is associated with T-cell activity against MRSV.

Experimental design

Three sets of experiments have been done using a kit and plate from Cellular Research. This is a 96-well single-cell prep system that results in 96 cells represented in a library. The three experiments included

  • One set of cells preped and sequenced on MiSeq
  • The same library sequenced on HiSeq
  • A second set of cells prepped and sequenced on HiSeq

Here, I am focusing on only the last experiment (Library2_Hiseq). For now, I am simply using the output from the pipeline that was run on SevenBridges.

Data preparation

library(readr)
library(Rtsne)

L2_MIM = read_csv('Library2_Hiseq/MolecularIndexMapping_Library2_Hiseq.csv')
L2_RM = read_csv('Library2_Hiseq/ReadMapping_Library2_Hiseq.csv')
L2_MIM_mat = as.matrix(L2_MIM[-c(1:3),-c(1:2)])
rownames(L2_MIM_mat) = as.character(L2_MIM[-c(1:3),1])
L2_RM_mat  = as.matrix(L2_RM[-c(1:3),-c(1:2)])
rownames(L2_RM_mat) = as.character(L2_RM[-c(1:3),1])

Here I am using a SummarizedExperiment container for the data, just to keep things organized.

library(SummarizedExperiment)
## Loading required package: GenomicRanges
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
se = SummarizedExperiment(assays=list(molindexmap=L2_MIM_mat,readmap = L2_RM_mat))
tmp = rep('invitro',ncol(se))
tmp[grep('Memory',colnames(se))]='Memory'
tmp[grep('Effector',colnames(se))]='Effector'
colData(se) = DataFrame(Class = tmp,t(L2_MIM[1:3,-c(1:2)]))
colnames(colData(se))[2:4]=L2_MIM[1:3,1]

Analysis

Explorations

The dataset resulted in 1504280 molecular index counts for the full dataset with 60983817

hist(log10(assays(se)$molindexmap),main='histogram of molecular index counts per gene/cell',
     xlab="log10(counts)")

hist(log10(assays(se)$readmap),main='histogram of read counts per gene/cell',
     xlab="log10(counts)")

smoothScatter(log2(assays(se)$molindexmap),assays(se)$readmap/(assays(se)$molindexmap+1))

hist(log10(assays(se)$readmap/(assays(se)$molindexmap)))

To get a sense of the number of detected genes, just sum the number of genes for which the counts>X. In this case, I chose X to be 2.

countthresh = 2
ngenes = apply(assays(se)$molindexmap,2,function(x) {sum(x>countthresh)})
hist(ngenes)

To get a more quantitative sense of the number of genes, we can look at the quantiles of number of expressed genes.

quantile(ngenes,seq(0,1,0.1))
##     0%    10%    20%    30%    40%    50%    60%    70%    80%    90% 
##  294.0  387.0  412.0  431.5  460.0  498.5  539.0  597.5  668.0 1228.0 
##   100% 
## 4357.0
barplot(ngenes)

There are some samples with much higher numbers of detected genes (20x difference from lowest to highest). To investigate the extent to which sequencing depth affects the number of detected genes, I plot the depth vs detected genes on log-log scale.

readcount = colSums(assays(se)$molindexmap)
plot(readcount,ngenes,log='xy',main='Log-Log plot of readcount vs number of detected genes')

sampleMedian = apply(assays(se)$molindexmap,2,function(x) {return(median(x[x>0]))})
plot(sampleMedian,ngenes,log='xy',main='Log-Log plot of readcount vs number of detected genes')

hist(log10(L2_MIM_mat),main='Count of MIM per gene/sample')

This next plot shows the number of genes expresssed in X samples.
Note that most genes are expressed in very few samples. However, there are a few hundred that are expressed in more than 30% of the samples.

samplecountPerGene = rowSums(L2_MIM_mat>0)
hist(samplecountPerGene[samplecountPerGene>0])

library(Rtsne)
#s = Rtsne(t(L2_MIM_mat),dims=4)
g = Rtsne(t(log2(L2_MIM_mat+1)),dims=4)

cvec = rep('black',96)
cvec[grep('Effector',colnames(L2_MIM_mat))]='green'
cvec[grep('Memory',colnames(L2_MIM_mat))]='red'
pairs(g$Y,col=cvec)

p = prcomp(t(log2(L2_MIM_mat+1)))

pairs(p$x[,1:4],col=cvec,main='pca')

ts = Rtsne(p$x[,1:4],dims=4)

pairs(ts$Y,col=cvec,main='tsne on pca')

Monocle

library(monocle)
## Loading required package: HSMMSingleCell
## Loading required package: ggplot2
## Loading required package: splines
## Loading required package: VGAM
## Loading required package: igraph
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:SummarizedExperiment':
## 
##     compare
## The following objects are masked from 'package:GenomicRanges':
## 
##     compare, union
## The following objects are masked from 'package:IRanges':
## 
##     compare, simplify, union
## The following objects are masked from 'package:S4Vectors':
## 
##     compare, union
## The following objects are masked from 'package:BiocGenerics':
## 
##     normalize, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
## Loading required package: plyr
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:IRanges':
## 
##     desc
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## No methods found in "BiocGenerics" for requests: as.vector, unlist
pd = new("AnnotatedDataFrame",data=data.frame(colData(se)))
fd = new('AnnotatedDataFrame',data=data.frame(gene=rownames(se)))
rownames(fd)=rownames(se)
mset = newCellDataSet(L2_MIM_mat,phenoData=pd,featureData=fd)
mset = mset[,pData(mset)$Class %in% c("Memory","Effector")]
mset = detectGenes(mset,min_expr=1)
expressedGenes = rownames(subset(fData(mset),num_cells_expressed >= 30))
library(reshape)
## 
## Attaching package: 'reshape'
## The following objects are masked from 'package:plyr':
## 
##     rename, round_any
## The following object is masked from 'package:IRanges':
## 
##     expand
## The following object is masked from 'package:S4Vectors':
## 
##     rename
library(ggplot2)
# log transform
L = log2(exprs(mset[expressedGenes,]))
# and standardize to same scale
melted_dens_df <- melt(t(scale(t(L))))
qplot(value, geom="density", data=melted_dens_df) +
  stat_function(fun = dnorm, size=0.5, color="red") +
  xlab("Standardized log(count)") +
  ylab("Density")
## Warning: Removed 128226 rows containing non-finite values (stat_density).

Differential expression

diff_test_res = differentialGeneTest(mset[expressedGenes,],
                                     fullModelFormulaStr = 'expression ~ Class')
hist(diff_test_res$pval)

hist(diff_test_res$qval)

And the results of the testing:

library(DT)
## 
## Attaching package: 'DT'
## The following object is masked from 'package:igraph':
## 
##     %>%
datatable(diff_test_res)

Monocle analysis

ordering_genes <- row.names (subset(diff_test_res, qval < 0.1))
msetsub = mset[expressedGenes,]
msetsub = setOrderingFilter(msetsub,ordering_genes)
msetsub <- reduceDimension(msetsub, use_irlba=FALSE)
## Reducing to independent components
msetsub <- orderCells(msetsub, num_paths=2, reverse=TRUE)
plot_spanning_tree(msetsub)

table(pData(msetsub)$Class,pData(msetsub)$State)
##           
##             1  2  3
##   Effector 13 28  8
##   Memory    2 30  5
datatable(pData(msetsub))