MAITAnalysis

suppressPackageStartupMessages({
    library(ggplot2)
    library(GGally)
    library(GSEABase)
    library(limma)
    library(reshape2)
    library(data.table)
    library(knitr)
    library(TxDb.Hsapiens.UCSC.hg19.knownGene)
    library(stringr)
    library(NMF)
    library(rsvd)
    library(RColorBrewer)
    library(MAST)
  # library(HDF5Array)
  library(pbapply)
  # library(mbenchmark)
})
#options(mc.cores = detectCores() - 1) #if you have multiple cores to spin
options(mc.cores = 1)
# knitr::opts_chunk$set(message = FALSE,error = FALSE,warning = FALSE,cache = FALSE,fig.width=8,fig.height=6, auto.dep=TRUE)

freq_expressed <- 0.2
FCTHRESHOLD <- log2(1.5)

data(maits, package='MAST')
mat <- t(maits$expressionmat)
dim(mat)

scaRaw <- FromMatrix(mat, maits$cdat, maits$fdat)

## Assuming data assay in position 1, with name et is log-transformed.

sceRaw <- SingleCellExperiment(assays=SimpleList(counts=mat), colData=colData(scaRaw), rowData=rowData(scaRaw))

all.equal(assay(scaRaw), assay(sceRaw))

## [1] TRUE

all.equal(colData(scaRaw), colData(sceRaw))

## [1] TRUE

all.equal(rowData(scaRaw), rowData(sceRaw))

## [1] TRUE

# heatmap doesn't work basically `NMF` package isn't aware of `DelayedArray::rowMeans`, and question is if there is anything I can do about it without asking `NMF` author to import `BiocGenerics::rowMeans`?
# aheatmap(assay(scaRaw[1:1000,]), labRow='', annCol=as.data.frame(colData(scaRaw)[,c('condition', 'ourfilter')]), distfun='spearman')
# aheatmap(assay(scaRaw.sce[1:1000,]), labRow='', annCol=as.data.frame(colData(scaRaw.sce)[,c('condition', 'ourfilter')]), distfun='spearman')

set.seed(123)
plotPCA <- function(sca_obj){
    projection <- rpca(t(assay(sca_obj)), retx=TRUE, k=4)$x
    colnames(projection)=c("PC1","PC2","PC3","PC4")
    pca <- data.table(projection,  as.data.frame(colData(sca_obj)))
    print(ggpairs(pca, columns=c('PC1', 'PC2', 'PC3', 'libSize', 'PercentToHuman', 'nGeneOn', 'exonRate'),
            mapping=aes(color=condition), upper=list(continuous='blank')))
    invisible(pca)
}

plotPCA(sceRaw)

filterCrit <- with(colData(sceRaw), pastFastqc=="PASS"& exonRate >0.3 & PercentToHuman>0.6 & nGeneOn> 4000)

filtering

sca <- subset(scaRaw,filterCrit)
sce <- subset(sceRaw,filterCrit)
all.equal(assay(sca), assay(sce))

## [1] TRUE

eid <- select(TxDb.Hsapiens.UCSC.hg19.knownGene,keys = mcols(sca)$entrez,keytype ="GENEID",columns = c("GENEID","TXNAME"))

## 'select()' returned many:many mapping between keys and columns

ueid <- unique(na.omit(eid)$GENEID)

sca <- sca[mcols(sca)$entrez %in% ueid,]
sce <- sce[mcols(sce)$entrez %in% ueid,]
all.equal(assay(sca), assay(sce))

## [1] TRUE

## Remove invariant genes
set.seed(123)
sca <- sca[sample(which(freq(sca)>0), 6000),]
set.seed(123)
sce <- sce[sample(which(freq(sce)>0), 6000),]

all.equal(assay(sca), assay(sce))

## [1] TRUE

cdr2 <-colSums(assay(sca)>0)
cdr2.sce <-colSums(assay(sce)>0)
all.equal(cdr2, cdr2.sce)

## [1] TRUE

# qplot(x=cdr2, y=colData(sca)$nGeneOn) + xlab('New CDR') + ylab('Old CDR')

colData(sca)$cngeneson <- scale(cdr2)
colData(sce)$cngeneson <- scale(cdr2)

# plotPCA(sce)

set.seed(123)
scaSample <- sca[sample(which(freq(sca)>.1), 20),]
set.seed(123)
sceSample <- sce[sample(which(freq(sce)>.1), 20),]
all.equal(assay(scaSample), as.matrix(assay(sceSample)))

## [1] TRUE

flat <- as(scaSample, 'data.table')
flat.sce <- as(sceSample, 'data.table')
all.equal(flat, flat.sce)

## [1] TRUE

# ggplot(flat, aes(x=value))+geom_density() +facet_wrap(~symbolid, scale='free_y')

thres <- thresholdSCRNACountMatrix(assay(sca), nbins = 20, min_per_bin = 30)
thres.sce <- thresholdSCRNACountMatrix(assay(sce), nbins = 20, min_per_bin = 30)

all.equal(thres, thres.sce)

assays(sca) <- list(thresh=thres$counts_threshold, tpm=assay(sca))
expressed_genes <- freq(sca) > freq_expressed
sca <- sca[expressed_genes,]

assays(sce) <- list(thresh=thres.sce$counts_threshold, tpm=assay(sce))
expressed_genes.sce <- freq(sce) > freq_expressed
all.equal(expressed_genes, expressed_genes.sce)

## [1] TRUE

sce <- sce[expressed_genes.sce,]
all.equal(assay(sca), assay(sce))

## [1] TRUE

cond<-factor(colData(sca)$condition)
cond<-relevel(cond,"Unstim")
colData(sca)$condition<-cond
set.seed(123)
zlmCond <- zlm(~condition + cngeneson, sca)

## 
## Done!

cond.sce<-factor(colData(sce)$condition)
cond.sce<-relevel(cond.sce,"Unstim")
colData(sce)$condition<-cond.sce
set.seed(123)
zlmCond.sce <- zlm(~condition + cngeneson, sce)

## 
## Done!

all.equal(zlmCond, zlmCond.sce, check.attributes = F)

## [1] TRUE

# The following are equivalent
## lrt <- lrTest(zlm, "condition")
## lrt <- lrTest(zlm, CoefficientHypothesis('conditionStim'))

# This would test if 2*cngeneson=conditionStim
#  This is sheer nonsense biologically and statistically, but gives an example of the flexibility.
## lrt <- lrTest(zlm, Hypothesis('2*cngeneson-conditionStim'))

#only test the condition coefficient.
summaryCond <- summary(zlmCond, doLRT='conditionStim')

## Combining coefficients and standard errors

## Calculating log-fold changes

## Calculating likelihood ratio tests

## Refitting on reduced model...

## 
## Done!

summaryCond.sce <- summary(zlmCond.sce, doLRT='conditionStim')

## Combining coefficients and standard errors

## Calculating log-fold changes

## Calculating likelihood ratio tests

## Refitting on reduced model...

## 
## Done!

all.equal(summaryCond, summaryCond.sce, check.attributes = F)

## [1] TRUE

#print the top 4 genes by contrast using the logFC
print(summaryCond.sce, n=4)

## Fitted zlm with top 4 genes per contrast:
## ( log fold change Z-score )
##  primerid conditionStim cngeneson
##  3002       12.7*         -6.3*  
##  3458       13.5*         -7.2*  
##  3559       13.1*         -2.5   
##  51399       5.4          -6.2*  
##  7570       12.1*         -2.7   
##  9184        6.5          -5.0*

## by discrete Z-score
print(summaryCond.sce, n=4, by='D')

## Fitted zlm with top 4 genes per contrast:
## ( Wald Z-scores on discrete )
##  primerid conditionStim cngeneson
##  125061     -1.4           4.6*  
##  2058        6.1*         -4.1   
##  25906      -1.8           4.9*  
##  3559        6.2*         -3.9   
##  4729        6.2*         -3.5   
##  83451      -3.3           4.5*  
##  83882      -2.3           5.0*  
##  8894        6.1*         -4.1

## by continuous Z-score
print(summaryCond.sce, n=4, by='C')

## Fitted zlm with top 4 genes per contrast:
## ( Wald Z-scores tests on continuous )
##  primerid conditionStim cngeneson
##  1043       -7.4*         -0.6   
##  3002        9.2*         -6.1*  
##  3458       13.5*         -7.2*  
##  51399       6.2*         -8.8*  
##  7570        5.0          -6.2*

# 
# ## ------------------------------------------------------------------------
# summaryDt <- summaryCond$datatable
# fcHurdle <- merge(summaryDt[contrast=='conditionStim' & component=='H',.(primerid, `Pr(>Chisq)`)], #hurdle P values
#                       summaryDt[contrast=='conditionStim' & component=='logFC', .(primerid, coef, ci.hi, ci.lo)], by='primerid') #logFC coefficients
# 
# fcHurdle[,fdr:=p.adjust(`Pr(>Chisq)`, 'fdr')]
# fcHurdleSig <- merge(fcHurdle[fdr<.05 & abs(coef)>FCTHRESHOLD], as.data.table(mcols(sca)), by='primerid')
# setorder(fcHurdleSig, fdr)
# 
# ## ---- fig.width=8,fig.height=8, dpi = 36---------------------------------
# entrez_to_plot <- fcHurdleSig[1:50,primerid]
# symbols_to_plot <- fcHurdleSig[1:50,symbolid]
# flat_dat <- as(sca[entrez_to_plot,], 'data.table')
# ggbase <- ggplot(flat_dat, aes(x=condition, y=thresh,color=condition)) + geom_jitter()+facet_wrap(~symbolid, scale='free_y')+ggtitle("DE Genes in Activated MAIT Cells")
# ggbase+geom_violin() 
# 
# ## ---- dpi = 36-----------------------------------------------------------
# flat_dat[,lmPred:=lm(thresh~cngeneson + condition)$fitted, key=symbolid]
# ggbase +aes(x=cngeneson) + geom_line(aes(y=lmPred), lty=1) + xlab('Standardized Cellular Detection Rate')
# 
# 
# ## ---- dpi = 36-----------------------------------------------------------
# ## This is all rather kludgy at the moment
# MM <- model.matrix(~condition,unique(colData(sca)[,c("condition"),drop=FALSE]))
# rownames(MM) <- str_extract(rownames(MM), 'Stim|Unstim')
# predicted <- predict(zlmCond,modelmatrix=MM)
# 
# ## Avert your eyes...
# predicted[, primerid:=as.character(primerid)]
# predicted_sig <- merge(mcols(sca), predicted[primerid%in%entrez_to_plot], by='primerid')
# predicted_sig <- as.data.table(predicted_sig)
# 
# ## plot with inverse logit transformed x-axis
# ggplot(predicted_sig)+aes(x=invlogit(etaD),y=muC,xse=seD,yse=seC,col=sample)+
#     facet_wrap(~symbolid,scales="free_y")+theme_linedraw()+
#     geom_point(size=0.5)+scale_x_continuous("Proportion expression")+
#     scale_y_continuous("Estimated Mean")+
#     stat_ell(aes(x=etaD,y=muC),level=0.95, invert='x')
# 
# 
# ## ---- fig.height=8-------------------------------------------------------
# mat_to_plot <- assay(sca[entrez_to_plot,])
# rownames(mat_to_plot) <- symbols_to_plot
# aheatmap(mat_to_plot,annCol=colData(sca)[,"condition"],main="DE genes",col=rev(colorRampPalette(colors = brewer.pal(name="PiYG",n=10))(20)))
# 
# 
# ## ------------------------------------------------------------------------
# table(colData(sca)$beta, exclude=NULL)
# #Note that we currently throw an uninformative error if a covariate is `NA`
# scaHasBeta <- subset(sca, !is.na(beta))
# 
# ## ----residuals-----------------------------------------------------------
# scaDE <- sca[entrez_to_plot,]
# zlmResidDE <- zlm(~condition + cngeneson, scaDE, hook=deviance_residuals_hook)
# residDE <- zlmResidDE@hookOut
# residDEMatrix <- do.call(rbind, residDE)
# 
# ## ----addResiduals, dpi = 36----------------------------------------------
# assays(scaDE) <- c(assays(scaDE), list(resid=residDEMatrix))
# scaResidFlat <- as(scaDE, 'data.table')
# scaResidFlat[1:4,]
# ggplot(scaResidFlat, aes(x=ngeneson, y=resid))+geom_point(aes(col=condition))+geom_smooth()+facet_wrap(~symbolid)
# 
# 
# ## ----boots, eval=TRUE----------------------------------------------------
# # bootstrap, resampling cells
# # R should be set to >50 if you were doing this for real.
# boots <- bootVcov1(zlmCond, R=4)
# 
# ## ----setupBTM, dependson="data;zlm;boots",results='hide'-----------------
# module <- "BTM"
# min_gene_in_module <- 5
# packageExt <- system.file("extdata", package='MAST')
# module_file <- list.files(packageExt, pattern = module, full.names = TRUE)
# gene_set <- getGmt(module_file)
# gene_ids <- geneIds(gene_set)
# gene_ids <- gene_ids[!names(gene_ids)%like%"TBA"&!names(gene_ids)%like%"B cell"]
# sets_indices <- limma::ids2indices(gene_ids, mcols(sca)$symbolid)
# # Only keep modules with at least min_gene_in_module
# sets_indices <- sets_indices[sapply(sets_indices, length) >= min_gene_in_module]
# 
# 
# ## ----gsea----------------------------------------------------------------
# gsea <- gseaAfterBoot(zlmCond, boots, sets_indices, CoefficientHypothesis("conditionStim")) 
# z_stat_comb <- summary(gsea, testType='normal')
# 
# ## ----gseaView------------------------------------------------------------
# sigModules <- z_stat_comb[combined_adj<.01]
# gseaTable <- melt(sigModules[,.(set, disc_Z, cont_Z, combined_Z)], id.vars='set')
# ggplot(gseaTable, aes(y=set, x=variable, fill=value))+geom_raster() + scale_fill_distiller(palette="PiYG")
#

MAITAnalysis_sce.R

wjiang2

Fri Feb 1 12:48:46 2019