Preparation

Set the threshold parameter

freq_expressed <- 0.2 # Threshold for expression frequency
FCTHRESHOLD <- log2(1.5)

Load the dataset

Load the MAIT data

The MAIT data is used as the illustrative dataset in the Coding page for the MAST package MAIT data

data(maits,package = "MAST")
MAIT <- maits
dim(maits$expressionmat)
## [1]    96 16302
dim(maits$fdat)
## [1] 16302     3
dim(maits$cdat)
## [1] 96 21
scaRaw <- FromMatrix(t(maits$expressionmat),maits$cdat,maits$fdat)

Filter the MAIT data and discard those with low expression rate

filterCrit <- with(colData(scaRaw),pastFastqc=="PASS"& exonRate >0.3 & PercentToHuman >0.6 & nGeneOn>4000)
sca <- subset(scaRaw,filterCrit)
#eid <- select(TxDb.Hsapiens.UCSC.hg19.knownGene,keys = mcols(sca)$entrez,keytype = "GENEID",columns=c("GENEID","TXNAME"))
#ueid <- unique(na.omit(eid)$GENEID)
#sca <- sca[mcols(sca)$entrez %in% ueid,]

#sca <- sca[sample(which(freq(sca)>0),6000),]
#cdr2 <- colSums(assay(sca)>0)
#qplot(x=cdr2,y=colData(sca)$nGeneOn )+ xlab("New CDR")+ ylab('Old CDR')

Load the SCEdata subset

There is a problem when I try to load the complete SCEdata because of the memory issue.

library(data.table)
SCEdata <- readRDS("GSE60361_SCESet_subset.rds")
CSCEdata <- readRDS("GSE60361_SCESet.rds")
summary(SCEdata)
## Length  Class   Mode 
##      1 SCESet     S4
pdata <- pData(SCEdata)
expres <- exprs(SCEdata)
fdata = data.frame(Gene = rownames(expres))
rownames(fdata) = fdata$Gene
SCE <- FromMatrix(expres, pdata, fdata)

#cpdata <- pData(CSCEdata)
#cexpres <- exprs(CSCEdata)
#cfdata <- data.frame(Gene=rownames(cexpres))
#cscaRaw <- FromMatrix(cexpres, cpdata, cfdata)

Filter the SCEdata set (We need to choose variables)

For the MAIT data, we filter the original data by “pastFastqc”,“exonrate”,“percentToHuman” and “nGeneOn”. In the target dataset for SCToolkit, ideally, we may let the users customize their filter options by themselves. (How to do that? We capture the desired variables as well as the range) So we will leave this Part at this stage and add it to the shiny app later.

#filtercrit <- with(colData(scaRaw),pastFastqc=="")
#str(scaRaw,1000)
#sca <- subset(scaRaw,filterCrit)

Exploratory data analysis (For SCEdata)

Heatmap

aheatmap(assay(SCE[1:100,]),labRow = '',annCol = as.data.frame(colData(SCE)[,c('total.mRNA.mol', 'level1class')]),distfun='spearman',width = 10,height = 10)

PCA

set.seed(10)
plotPCA <- function(sca_obj){
  projection <- rpca(t(assay(sca_obj)),retx = TRUE,k=4)$x
  pca <- data.table(projection,as.data.frame(colData(sca_obj)))
  print(ggpairs(pca,columns = c('PC1','PC2','tissue', 'group', 'total.mRNA.mol', 'well'),
                mapping = aes(color=tissue  ),upper=list(continuous='blank')))
  invisible(pca)
}
plotPCA(SCE)

head(expres)
##          X1772063061_G11 X1772067064_C10 X1772062113_D09 X1772062111_A12
## Tspan12         6.162698        6.162698        6.162698        6.162698
## Tshz1           6.162698        6.162698        7.211905        6.162698
## Fnbp1l          6.162698        6.162698        7.211905        6.162698
## Adamts15        6.162698        6.162698        6.162698        6.162698
## Cldn12          6.162698        7.337548        6.162698        6.162698
## Rxfp1           6.162698        6.162698        6.162698        6.162698
##          X1772071040_C12 X1772062111_D09 X1772071036_G02 X1772071040_D04
## Tspan12         7.110472        6.162698        6.162698        6.162698
## Tshz1           7.677597        6.162698        7.302883        6.162698
## Fnbp1l          8.083633        6.162698        6.162698        6.162698
## Adamts15        6.162698        6.162698        6.162698        6.162698
## Cldn12          6.162698        6.162698        6.162698        6.162698
## Rxfp1           7.110472        6.162698        6.162698        6.162698
##          X1772058177_E12 X1772066070_G10 X1772063074_G10 X1772062115_E07
## Tspan12         6.162698        6.162698        6.162698        6.162698
## Tshz1           7.210935        6.162698        6.162698        6.162698
## Fnbp1l          6.162698        7.507985        6.162698        6.162698
## Adamts15        6.162698        6.162698        6.162698        6.162698
## Cldn12          6.162698        7.507985        6.162698        6.162698
## Rxfp1           6.162698        6.162698        6.162698        6.162698
##          X1772062109_F01 X1772060226_G05 X1772066108_D06 X1772060224_F07
## Tspan12         6.162698        6.162698        6.162698        6.162698
## Tshz1           6.162698        6.162698        6.162698        6.162698
## Fnbp1l          6.162698        7.969989        6.162698        6.162698
## Adamts15        6.162698        6.162698        6.162698        6.162698
## Cldn12          6.162698        6.162698        7.882452        6.162698
## Rxfp1           6.162698        6.162698        6.162698        6.162698
##          X1772062111_F12 X1772063078_G02 X1772060240_H11 X1772067094_C07
## Tspan12         6.162698        6.162698        6.162698        6.162698
## Tshz1          10.453699        6.162698        6.162698        6.162698
## Fnbp1l          6.162698        8.629441        6.162698        6.162698
## Adamts15        6.162698        6.162698        6.162698        6.162698
## Cldn12          6.162698        6.162698        6.162698        6.162698
## Rxfp1           6.162698        6.162698        6.162698        6.162698
##          X1772058177_A09 X1772058171_C05 X1772062128_B03 X1772062118_F04
## Tspan12         6.162698        6.162698        6.162698        6.162698
## Tshz1           6.162698        8.744934        6.162698        6.162698
## Fnbp1l          6.162698        6.162698        6.162698        6.162698
## Adamts15        6.162698        6.162698        6.162698        6.162698
## Cldn12          6.162698        6.162698        6.162698        6.162698
## Rxfp1           6.162698        6.162698        6.162698        6.162698
##          X1772062109_E05 X1772062128_D02 X1772062109_E12 X1772062113_E05
## Tspan12         6.162698        6.162698        6.162698        7.731217
## Tshz1           6.162698        8.065736        8.907126        6.162698
## Fnbp1l          6.162698        6.162698        6.162698        7.731217
## Adamts15        6.162698        6.162698        6.162698        6.162698
## Cldn12          6.162698        6.162698        6.162698        6.162698
## Rxfp1           6.162698        6.162698        6.162698        6.162698
##          X1772062128_F05 X1772062118_D09
## Tspan12         6.162698        6.162698
## Tshz1           6.162698        6.162698
## Fnbp1l          6.162698        6.162698
## Adamts15        6.162698        6.162698
## Cldn12          6.162698        6.162698
## Rxfp1           6.162698        6.162698
# Notice that in a Single cell dataset, it is well known to be #zero-inflated and bimodal. However, in our case, since the SCEdata
#has already normalized the expression, it is not zero-inflated.
median(expres) # The 6.162698 is equivalent to 0 in this case
## [1] 6.162698
SCEsample1 <- sample(SCE,50) # Simply get a sample with size 50 from SCE
flat1 <- as(SCEsample1,'data.table')
ggplot(flat1,aes(x=value))+geom_density()+facet_wrap(~Gene,scale='free_y')

# The data are 6.162698 inflated rather than 0
expres_new <- expres-median(expres)
SCE <- FromMatrix(expres_new, pdata, fdata)
# Notice that here, after the expression matrix subtract 6.162698, the remaining non-zero entries are extremely small. Is it because of the large standard deviation used for normalization? 

Addaptive thresholding

# Here we try to adapt a threshold for low expression gene.

# Freq function calculate the percentage of cells expressed for
# a single gene??

# We want the expression rate larger than 0.1.
SCEsample2 <- SCE[sample(which(freq(SCE)>0.1),50),]
flat2 <- as(SCEsample2,'data.table')
ggplot(flat2,aes(x=value))+geom_density()+facet_wrap(~Gene,scale='free_y')

# Look nicer now

thres <- thresholdSCRNACountMatrix(assay(SCE),nbins = 20,min_per_bin = 30)
## (-1e-10,0.126]  (0.126,0.428]  (0.428,0.608]  (0.608,0.811]   (0.811,1.04] 
##      0.5175085      0.5175085      0.5558732      0.7867676      0.4962100 
##     (1.04,1.3]     (1.3,1.59]    (1.59,1.91]    (1.91,2.28]    (2.28,2.69] 
##      0.4926872      0.5121377      0.5157759      0.5334249      0.5605892 
##    (2.69,3.16]    (3.16,3.68]    (3.68,4.27]    (4.27,9.75] 
##      0.6090587      0.6683612      0.7912240      0.8658974
par(mfrow=c(4,4))
plot(thres)

Cited from MAST: “We suspect that the left-most mode corresponds to non-specific hybridization of mRNA or genomic DNA”. I think it addaptive to the SCEdata too. Consequently, we only need to gene with higher expression rate for further analysis.

assays(SCE) <- list(thresh=thres$counts_threshold,tpm=assay(SCE))
expressed_genes <- freq(SCE)>freq_expressed
SCE <- SCE[expressed_genes,]
thres <- thresholdSCRNACountMatrix(assay(SCE),nbins = 20,min_per_bin = 30)
## (1.05,1.23] (1.23,1.42] (1.42,1.63] (1.63,1.85]  (1.85,2.1]  (2.1,2.37] 
##    2.183890    2.183890    2.183890    2.183890    2.183890    2.183890 
## (2.37,2.66] (2.66,2.98] (2.98,3.32] (3.32,3.69]  (3.69,4.1]  (4.1,9.75] 
##    2.183890    2.183890    2.183890    2.183890    2.327815    8.874137
par(mfrow=c(5,4))
plot(thres)

Defferential expression using a hurdle model

In MAITS data, it models the condition (stim&unstim) and (centered) ngeneson factor to adjust for the cellular detection rate. Noticed that here the reference level for condition is “unstimulated”. Here comes the question, what is the meaning of “centered” here? I try to use the rowsum(maits$expressionmat) to calculate the number of expressed genes for a cell but can’t reproduce the result.

pdata$nGeneOn <- colSums(expres)
SCE_new <- FromMatrix(expres_new, pdata, fdata)
SCE_new <- SCE_new[expressed_genes,]
SCE_new_sample <- SCE_new[sample(which(freq(SCE_new)>0.1),50),]

tissue <- factor(colData(SCE_new_sample)$tissue)
colData(SCE_new_sample)$tissue <- tissue 
zlmtissue <- zlm.SingleCellAssay(~tissue+nGeneOn,SCE_new_sample)
summarytissue <- summary(zlmtissue,doLRT='tissuesscortex')

# We can print the top 5 genes by contrast using the logFC or
# discrete Z-score or continuous Z-score 
print(summarytissue,5)
## Fitted zlm with top 5 genes per contrast:
## ( log fold change Z-score )
##  primerid tissuesscortex nGeneOn
##  Csnk1e     -0.2*          -0.8*
##  Fcgr3       2.2*          -0.5*
##  Kxd1       -0.3*           0.3*
##  Marcks      0.1*          -2.9*
##  Tgoln1     -0.1*          -0.2*
print(summarytissue,5,by="D")
## Fitted zlm with top 5 genes per contrast:
## ( Wald Z-scores on discrete )
##  primerid      tissuesscortex nGeneOn
##  0610031J06Rik    1.7*           0.4 
##  Csnk1a1         -1.4*           0.3 
##  Marcks          -1.2*          -0.9 
##  Mir682           0.3            1.6*
##  Mrpl51          -0.1            2.1*
##  Pea15a           0.3            1.7*
##  Plekhh1          1.2*           0.1 
##  Psmb2            0.3            2.3*
##  Rps8             0.6            2.1*
##  Smc1a            1.3*           1.5
print(summarytissue,5,by="C")
## Fitted zlm with top 5 genes per contrast:
## ( Wald Z-scores tests on continuous )
##  primerid tissuesscortex nGeneOn
##  Csnk1a1    -1.5*          -1.1 
##  Dusp1      -0.6           -2.6*
##  Fcgr3       2.2*          -0.5 
##  Marcks      0.1           -2.9*
##  Mfn1        0.3            2.0*
##  Mrpl51      1.0            2.3*
##  Psmb2       1.3*           0.7 
##  Smc1a       1.3*           0.6 
##  Spred1     -0.5            1.9*
##  Tgoln1     -1.7*          -0.4
# We can also run a likelihood ratio test for models droping the
# given factors.

A better way out, make a giant data.table

summaryDT <- summarytissue$datatable
Hurdle <- merge(summaryDT[contrast=='tissuesscortex'&component=='H',.(primerid,`Pr(>Chisq)`)],summaryDT[contrast=='tissuesscortex'&component=='logFC',.(primerid,coef,ci.hi,ci.lo)],by='primerid')

# A sample
head(Hurdle[,fdr:=p.adjust(`Pr(>Chisq)`,'fdr')],100)
##          primerid Pr(>Chisq)          coef        ci.hi         ci.lo
##  1: 0610009D07Rik  0.4750551  1.016834e-30 1.098883e-27 -1.096850e-27
##  2: 0610031J06Rik  0.1692712  2.020406e-07 2.254714e-05 -2.214306e-05
##  3: 4930526I15Rik  0.4639990  3.234863e+00 3.970243e+02 -3.905546e+02
##  4:          Abat  0.4802472 -1.543739e-03 1.453652e+00 -1.456740e+00
##  5:         Abhd5  0.6533354  1.130437e-13 3.498532e-11 -3.475923e-11
##  6:           Abr  0.4282090 -1.625525e-16 1.788956e-14 -1.821467e-14
##  7:         Adrb1  0.5809814 -3.556503e-11 4.298758e-08 -4.305871e-08
##  8:         Asxl2  0.9634626  6.679813e-05 7.038892e-03 -6.905295e-03
##  9:        Atxn10  0.6990038 -9.159424e-16 1.678546e-13 -1.696865e-13
## 10:         Clic4  0.9418289  1.914838e-27 5.129136e-25 -5.090839e-25
## 11:         Cops5  0.9029959  5.847402e-32 7.251898e-30 -7.134950e-30
## 12:       Csnk1a1  0.1018241 -9.631072e-05 1.130481e-02 -1.149743e-02
## 13:        Csnk1e  0.8703142 -2.457720e-01 2.455327e+00 -2.946871e+00
## 14:          Cul5  0.5330835 -8.897884e-05 2.408462e-02 -2.426258e-02
## 15:          Dstn  0.7535223 -1.405888e-26 2.466568e-24 -2.494685e-24
## 16:         Dusp1  0.3420144 -3.191695e-05 3.294878e-03 -3.358712e-03
## 17:          Ehd4  0.9619238 -1.829905e-02 5.182279e+00 -5.218877e+00
## 18:        Fam50a  0.8603897 -2.593705e-02 4.265724e+00 -4.317598e+00
## 19:         Fcgr3  0.3207652  2.630089e+00 4.974711e+00  2.854676e-01
## 20:          Hhip  0.8407467  5.067115e-05 5.360449e-03 -5.259107e-03
## 21:         Kdm5c  0.8626728 -4.366516e-26 1.281980e-23 -1.290713e-23
## 22:         Kif3c  0.6849190  4.058279e-05 6.474256e-03 -6.393090e-03
## 23:          Kxd1  0.9280728 -5.365538e-01 3.230743e+00 -4.303851e+00
## 24:        Marcks  0.2577702  6.902130e-02 1.538937e+00 -1.400894e+00
## 25:         Mcts1  0.8560380 -1.711800e-11 1.958286e-09 -1.992522e-09
## 26:          Mfn1  0.3873438 -2.267765e-22 2.291821e-20 -2.337176e-20
## 27:         Micu1  0.9226237  2.420298e-04 4.923229e-02 -4.874823e-02
## 28:        Mir682  0.9036782 -1.063150e-33 1.182109e-31 -1.203372e-31
## 29:        Mrpl51  0.7662496  1.027883e-47 1.252303e-45 -1.231746e-45
## 30:        Ndufb3  0.8726596 -5.789674e-32 1.029382e-29 -1.040961e-29
## 31:        Pea15a  0.6244466  4.269655e-39 1.108934e-36 -1.100394e-36
## 32:       Plekhh1  0.2981824 -6.294926e-02 8.078897e+00 -8.204796e+00
## 33:         Psmb2  0.5809874 -2.133274e-57 4.928456e-55 -4.971121e-55
## 34:          Rps8  0.6522889 -5.973655e-56 9.287918e-54 -9.407391e-54
## 35:         Scn8a  0.8821164 -6.395409e-24 6.405901e-22 -6.533809e-22
## 36:         Setd8  0.9733859 -1.743011e-30 1.761610e-28 -1.796471e-28
## 37:         Smc1a  0.2287321 -3.220855e-31 6.040827e-29 -6.105244e-29
## 38:         Snhg9  0.2946691  1.289446e-09 1.580629e-07 -1.554840e-07
## 39:        Spred1  0.9378469  2.720893e-04 3.024604e-02 -2.970186e-02
## 40:         Stmn1  0.9654608  5.815127e-30 7.697017e-28 -7.580714e-28
## 41:        Tgoln1  0.5152793 -1.326362e+00 1.835522e+01 -2.100794e+01
## 42:          Tlk1  0.7640892  2.703688e-15 3.932912e-13 -3.878838e-13
## 43:      Tmem184b  0.8618506 -5.367958e-16 1.029059e-12 -1.030133e-12
## 44:          Tmx1  0.5806770  4.805034e-16 8.595864e-14 -8.499763e-14
## 45:         Tnks2  0.4062025 -2.013906e-27 2.384359e-25 -2.424638e-25
## 46:        Tnrc6c  0.9636313 -8.209821e-17 1.008796e-14 -1.025216e-14
## 47:         Tra2b  0.8040551  5.456255e-13 1.029543e-10 -1.018631e-10
## 48:         Ttc14  0.9457661 -2.917359e-28 3.730229e-26 -3.788576e-26
## 49:       Unc93b1  0.6918524 -1.652286e-06 1.721162e-04 -1.754208e-04
## 50:        Zfp148  0.7535605 -2.006690e-21 2.018380e-19 -2.058514e-19
##          primerid Pr(>Chisq)          coef        ci.hi         ci.lo
##           fdr
##  1: 0.9733859
##  2: 0.9733859
##  3: 0.9733859
##  4: 0.9733859
##  5: 0.9733859
##  6: 0.9733859
##  7: 0.9733859
##  8: 0.9733859
##  9: 0.9733859
## 10: 0.9733859
## 11: 0.9733859
## 12: 0.9733859
## 13: 0.9733859
## 14: 0.9733859
## 15: 0.9733859
## 16: 0.9733859
## 17: 0.9733859
## 18: 0.9733859
## 19: 0.9733859
## 20: 0.9733859
## 21: 0.9733859
## 22: 0.9733859
## 23: 0.9733859
## 24: 0.9733859
## 25: 0.9733859
## 26: 0.9733859
## 27: 0.9733859
## 28: 0.9733859
## 29: 0.9733859
## 30: 0.9733859
## 31: 0.9733859
## 32: 0.9733859
## 33: 0.9733859
## 34: 0.9733859
## 35: 0.9733859
## 36: 0.9733859
## 37: 0.9733859
## 38: 0.9733859
## 39: 0.9733859
## 40: 0.9733859
## 41: 0.9733859
## 42: 0.9733859
## 43: 0.9733859
## 44: 0.9733859
## 45: 0.9733859
## 46: 0.9733859
## 47: 0.9733859
## 48: 0.9733859
## 49: 0.9733859
## 50: 0.9733859
##           fdr
# sth wrong here, need to be fixed