freq_expressed <- 0.2 # Threshold for expression frequency
FCTHRESHOLD <- log2(1.5)
The MAIT data is used as the illustrative dataset in the Coding page for the MAST package MAIT data
data(maits,package = "MAST")
MAIT <- maits
dim(maits$expressionmat)
## [1] 96 16302
dim(maits$fdat)
## [1] 16302 3
dim(maits$cdat)
## [1] 96 21
scaRaw <- FromMatrix(t(maits$expressionmat),maits$cdat,maits$fdat)
filterCrit <- with(colData(scaRaw),pastFastqc=="PASS"& exonRate >0.3 & PercentToHuman >0.6 & nGeneOn>4000)
sca <- subset(scaRaw,filterCrit)
#eid <- select(TxDb.Hsapiens.UCSC.hg19.knownGene,keys = mcols(sca)$entrez,keytype = "GENEID",columns=c("GENEID","TXNAME"))
#ueid <- unique(na.omit(eid)$GENEID)
#sca <- sca[mcols(sca)$entrez %in% ueid,]
#sca <- sca[sample(which(freq(sca)>0),6000),]
#cdr2 <- colSums(assay(sca)>0)
#qplot(x=cdr2,y=colData(sca)$nGeneOn )+ xlab("New CDR")+ ylab('Old CDR')
There is a problem when I try to load the complete SCEdata because of the memory issue.
library(data.table)
SCEdata <- readRDS("GSE60361_SCESet_subset.rds")
CSCEdata <- readRDS("GSE60361_SCESet.rds")
summary(SCEdata)
## Length Class Mode
## 1 SCESet S4
pdata <- pData(SCEdata)
expres <- exprs(SCEdata)
fdata = data.frame(Gene = rownames(expres))
rownames(fdata) = fdata$Gene
SCE <- FromMatrix(expres, pdata, fdata)
#cpdata <- pData(CSCEdata)
#cexpres <- exprs(CSCEdata)
#cfdata <- data.frame(Gene=rownames(cexpres))
#cscaRaw <- FromMatrix(cexpres, cpdata, cfdata)
For the MAIT data, we filter the original data by “pastFastqc”,“exonrate”,“percentToHuman” and “nGeneOn”. In the target dataset for SCToolkit, ideally, we may let the users customize their filter options by themselves. (How to do that? We capture the desired variables as well as the range) So we will leave this Part at this stage and add it to the shiny app later.
#filtercrit <- with(colData(scaRaw),pastFastqc=="")
#str(scaRaw,1000)
#sca <- subset(scaRaw,filterCrit)
aheatmap(assay(SCE[1:100,]),labRow = '',annCol = as.data.frame(colData(SCE)[,c('total.mRNA.mol', 'level1class')]),distfun='spearman',width = 10,height = 10)
set.seed(10)
plotPCA <- function(sca_obj){
projection <- rpca(t(assay(sca_obj)),retx = TRUE,k=4)$x
pca <- data.table(projection,as.data.frame(colData(sca_obj)))
print(ggpairs(pca,columns = c('PC1','PC2','tissue', 'group', 'total.mRNA.mol', 'well'),
mapping = aes(color=tissue ),upper=list(continuous='blank')))
invisible(pca)
}
plotPCA(SCE)
head(expres)
## X1772063061_G11 X1772067064_C10 X1772062113_D09 X1772062111_A12
## Tspan12 6.162698 6.162698 6.162698 6.162698
## Tshz1 6.162698 6.162698 7.211905 6.162698
## Fnbp1l 6.162698 6.162698 7.211905 6.162698
## Adamts15 6.162698 6.162698 6.162698 6.162698
## Cldn12 6.162698 7.337548 6.162698 6.162698
## Rxfp1 6.162698 6.162698 6.162698 6.162698
## X1772071040_C12 X1772062111_D09 X1772071036_G02 X1772071040_D04
## Tspan12 7.110472 6.162698 6.162698 6.162698
## Tshz1 7.677597 6.162698 7.302883 6.162698
## Fnbp1l 8.083633 6.162698 6.162698 6.162698
## Adamts15 6.162698 6.162698 6.162698 6.162698
## Cldn12 6.162698 6.162698 6.162698 6.162698
## Rxfp1 7.110472 6.162698 6.162698 6.162698
## X1772058177_E12 X1772066070_G10 X1772063074_G10 X1772062115_E07
## Tspan12 6.162698 6.162698 6.162698 6.162698
## Tshz1 7.210935 6.162698 6.162698 6.162698
## Fnbp1l 6.162698 7.507985 6.162698 6.162698
## Adamts15 6.162698 6.162698 6.162698 6.162698
## Cldn12 6.162698 7.507985 6.162698 6.162698
## Rxfp1 6.162698 6.162698 6.162698 6.162698
## X1772062109_F01 X1772060226_G05 X1772066108_D06 X1772060224_F07
## Tspan12 6.162698 6.162698 6.162698 6.162698
## Tshz1 6.162698 6.162698 6.162698 6.162698
## Fnbp1l 6.162698 7.969989 6.162698 6.162698
## Adamts15 6.162698 6.162698 6.162698 6.162698
## Cldn12 6.162698 6.162698 7.882452 6.162698
## Rxfp1 6.162698 6.162698 6.162698 6.162698
## X1772062111_F12 X1772063078_G02 X1772060240_H11 X1772067094_C07
## Tspan12 6.162698 6.162698 6.162698 6.162698
## Tshz1 10.453699 6.162698 6.162698 6.162698
## Fnbp1l 6.162698 8.629441 6.162698 6.162698
## Adamts15 6.162698 6.162698 6.162698 6.162698
## Cldn12 6.162698 6.162698 6.162698 6.162698
## Rxfp1 6.162698 6.162698 6.162698 6.162698
## X1772058177_A09 X1772058171_C05 X1772062128_B03 X1772062118_F04
## Tspan12 6.162698 6.162698 6.162698 6.162698
## Tshz1 6.162698 8.744934 6.162698 6.162698
## Fnbp1l 6.162698 6.162698 6.162698 6.162698
## Adamts15 6.162698 6.162698 6.162698 6.162698
## Cldn12 6.162698 6.162698 6.162698 6.162698
## Rxfp1 6.162698 6.162698 6.162698 6.162698
## X1772062109_E05 X1772062128_D02 X1772062109_E12 X1772062113_E05
## Tspan12 6.162698 6.162698 6.162698 7.731217
## Tshz1 6.162698 8.065736 8.907126 6.162698
## Fnbp1l 6.162698 6.162698 6.162698 7.731217
## Adamts15 6.162698 6.162698 6.162698 6.162698
## Cldn12 6.162698 6.162698 6.162698 6.162698
## Rxfp1 6.162698 6.162698 6.162698 6.162698
## X1772062128_F05 X1772062118_D09
## Tspan12 6.162698 6.162698
## Tshz1 6.162698 6.162698
## Fnbp1l 6.162698 6.162698
## Adamts15 6.162698 6.162698
## Cldn12 6.162698 6.162698
## Rxfp1 6.162698 6.162698
# Notice that in a Single cell dataset, it is well known to be #zero-inflated and bimodal. However, in our case, since the SCEdata
#has already normalized the expression, it is not zero-inflated.
median(expres) # The 6.162698 is equivalent to 0 in this case
## [1] 6.162698
SCEsample1 <- sample(SCE,50) # Simply get a sample with size 50 from SCE
flat1 <- as(SCEsample1,'data.table')
ggplot(flat1,aes(x=value))+geom_density()+facet_wrap(~Gene,scale='free_y')
# The data are 6.162698 inflated rather than 0
expres_new <- expres-median(expres)
SCE <- FromMatrix(expres_new, pdata, fdata)
# Notice that here, after the expression matrix subtract 6.162698, the remaining non-zero entries are extremely small. Is it because of the large standard deviation used for normalization?
# Here we try to adapt a threshold for low expression gene.
# Freq function calculate the percentage of cells expressed for
# a single gene??
# We want the expression rate larger than 0.1.
SCEsample2 <- SCE[sample(which(freq(SCE)>0.1),50),]
flat2 <- as(SCEsample2,'data.table')
ggplot(flat2,aes(x=value))+geom_density()+facet_wrap(~Gene,scale='free_y')
# Look nicer now
thres <- thresholdSCRNACountMatrix(assay(SCE),nbins = 20,min_per_bin = 30)
## (-1e-10,0.126] (0.126,0.428] (0.428,0.608] (0.608,0.811] (0.811,1.04]
## 0.5175085 0.5175085 0.5558732 0.7867676 0.4962100
## (1.04,1.3] (1.3,1.59] (1.59,1.91] (1.91,2.28] (2.28,2.69]
## 0.4926872 0.5121377 0.5157759 0.5334249 0.5605892
## (2.69,3.16] (3.16,3.68] (3.68,4.27] (4.27,9.75]
## 0.6090587 0.6683612 0.7912240 0.8658974
par(mfrow=c(4,4))
plot(thres)
Cited from MAST: “We suspect that the left-most mode corresponds to non-specific hybridization of mRNA or genomic DNA”. I think it addaptive to the SCEdata too. Consequently, we only need to gene with higher expression rate for further analysis.
assays(SCE) <- list(thresh=thres$counts_threshold,tpm=assay(SCE))
expressed_genes <- freq(SCE)>freq_expressed
SCE <- SCE[expressed_genes,]
thres <- thresholdSCRNACountMatrix(assay(SCE),nbins = 20,min_per_bin = 30)
## (1.05,1.23] (1.23,1.42] (1.42,1.63] (1.63,1.85] (1.85,2.1] (2.1,2.37]
## 2.183890 2.183890 2.183890 2.183890 2.183890 2.183890
## (2.37,2.66] (2.66,2.98] (2.98,3.32] (3.32,3.69] (3.69,4.1] (4.1,9.75]
## 2.183890 2.183890 2.183890 2.183890 2.327815 8.874137
par(mfrow=c(5,4))
plot(thres)
In MAITS data, it models the condition (stim&unstim) and (centered) ngeneson factor to adjust for the cellular detection rate. Noticed that here the reference level for condition is “unstimulated”. Here comes the question, what is the meaning of “centered” here? I try to use the rowsum(maits$expressionmat) to calculate the number of expressed genes for a cell but can’t reproduce the result.
pdata$nGeneOn <- colSums(expres)
SCE_new <- FromMatrix(expres_new, pdata, fdata)
SCE_new <- SCE_new[expressed_genes,]
SCE_new_sample <- SCE_new[sample(which(freq(SCE_new)>0.1),50),]
tissue <- factor(colData(SCE_new_sample)$tissue)
colData(SCE_new_sample)$tissue <- tissue
zlmtissue <- zlm.SingleCellAssay(~tissue+nGeneOn,SCE_new_sample)
summarytissue <- summary(zlmtissue,doLRT='tissuesscortex')
# We can print the top 5 genes by contrast using the logFC or
# discrete Z-score or continuous Z-score
print(summarytissue,5)
## Fitted zlm with top 5 genes per contrast:
## ( log fold change Z-score )
## primerid tissuesscortex nGeneOn
## Csnk1e -0.2* -0.8*
## Fcgr3 2.2* -0.5*
## Kxd1 -0.3* 0.3*
## Marcks 0.1* -2.9*
## Tgoln1 -0.1* -0.2*
print(summarytissue,5,by="D")
## Fitted zlm with top 5 genes per contrast:
## ( Wald Z-scores on discrete )
## primerid tissuesscortex nGeneOn
## 0610031J06Rik 1.7* 0.4
## Csnk1a1 -1.4* 0.3
## Marcks -1.2* -0.9
## Mir682 0.3 1.6*
## Mrpl51 -0.1 2.1*
## Pea15a 0.3 1.7*
## Plekhh1 1.2* 0.1
## Psmb2 0.3 2.3*
## Rps8 0.6 2.1*
## Smc1a 1.3* 1.5
print(summarytissue,5,by="C")
## Fitted zlm with top 5 genes per contrast:
## ( Wald Z-scores tests on continuous )
## primerid tissuesscortex nGeneOn
## Csnk1a1 -1.5* -1.1
## Dusp1 -0.6 -2.6*
## Fcgr3 2.2* -0.5
## Marcks 0.1 -2.9*
## Mfn1 0.3 2.0*
## Mrpl51 1.0 2.3*
## Psmb2 1.3* 0.7
## Smc1a 1.3* 0.6
## Spred1 -0.5 1.9*
## Tgoln1 -1.7* -0.4
# We can also run a likelihood ratio test for models droping the
# given factors.
A better way out, make a giant data.table
summaryDT <- summarytissue$datatable
Hurdle <- merge(summaryDT[contrast=='tissuesscortex'&component=='H',.(primerid,`Pr(>Chisq)`)],summaryDT[contrast=='tissuesscortex'&component=='logFC',.(primerid,coef,ci.hi,ci.lo)],by='primerid')
# A sample
head(Hurdle[,fdr:=p.adjust(`Pr(>Chisq)`,'fdr')],100)
## primerid Pr(>Chisq) coef ci.hi ci.lo
## 1: 0610009D07Rik 0.4750551 1.016834e-30 1.098883e-27 -1.096850e-27
## 2: 0610031J06Rik 0.1692712 2.020406e-07 2.254714e-05 -2.214306e-05
## 3: 4930526I15Rik 0.4639990 3.234863e+00 3.970243e+02 -3.905546e+02
## 4: Abat 0.4802472 -1.543739e-03 1.453652e+00 -1.456740e+00
## 5: Abhd5 0.6533354 1.130437e-13 3.498532e-11 -3.475923e-11
## 6: Abr 0.4282090 -1.625525e-16 1.788956e-14 -1.821467e-14
## 7: Adrb1 0.5809814 -3.556503e-11 4.298758e-08 -4.305871e-08
## 8: Asxl2 0.9634626 6.679813e-05 7.038892e-03 -6.905295e-03
## 9: Atxn10 0.6990038 -9.159424e-16 1.678546e-13 -1.696865e-13
## 10: Clic4 0.9418289 1.914838e-27 5.129136e-25 -5.090839e-25
## 11: Cops5 0.9029959 5.847402e-32 7.251898e-30 -7.134950e-30
## 12: Csnk1a1 0.1018241 -9.631072e-05 1.130481e-02 -1.149743e-02
## 13: Csnk1e 0.8703142 -2.457720e-01 2.455327e+00 -2.946871e+00
## 14: Cul5 0.5330835 -8.897884e-05 2.408462e-02 -2.426258e-02
## 15: Dstn 0.7535223 -1.405888e-26 2.466568e-24 -2.494685e-24
## 16: Dusp1 0.3420144 -3.191695e-05 3.294878e-03 -3.358712e-03
## 17: Ehd4 0.9619238 -1.829905e-02 5.182279e+00 -5.218877e+00
## 18: Fam50a 0.8603897 -2.593705e-02 4.265724e+00 -4.317598e+00
## 19: Fcgr3 0.3207652 2.630089e+00 4.974711e+00 2.854676e-01
## 20: Hhip 0.8407467 5.067115e-05 5.360449e-03 -5.259107e-03
## 21: Kdm5c 0.8626728 -4.366516e-26 1.281980e-23 -1.290713e-23
## 22: Kif3c 0.6849190 4.058279e-05 6.474256e-03 -6.393090e-03
## 23: Kxd1 0.9280728 -5.365538e-01 3.230743e+00 -4.303851e+00
## 24: Marcks 0.2577702 6.902130e-02 1.538937e+00 -1.400894e+00
## 25: Mcts1 0.8560380 -1.711800e-11 1.958286e-09 -1.992522e-09
## 26: Mfn1 0.3873438 -2.267765e-22 2.291821e-20 -2.337176e-20
## 27: Micu1 0.9226237 2.420298e-04 4.923229e-02 -4.874823e-02
## 28: Mir682 0.9036782 -1.063150e-33 1.182109e-31 -1.203372e-31
## 29: Mrpl51 0.7662496 1.027883e-47 1.252303e-45 -1.231746e-45
## 30: Ndufb3 0.8726596 -5.789674e-32 1.029382e-29 -1.040961e-29
## 31: Pea15a 0.6244466 4.269655e-39 1.108934e-36 -1.100394e-36
## 32: Plekhh1 0.2981824 -6.294926e-02 8.078897e+00 -8.204796e+00
## 33: Psmb2 0.5809874 -2.133274e-57 4.928456e-55 -4.971121e-55
## 34: Rps8 0.6522889 -5.973655e-56 9.287918e-54 -9.407391e-54
## 35: Scn8a 0.8821164 -6.395409e-24 6.405901e-22 -6.533809e-22
## 36: Setd8 0.9733859 -1.743011e-30 1.761610e-28 -1.796471e-28
## 37: Smc1a 0.2287321 -3.220855e-31 6.040827e-29 -6.105244e-29
## 38: Snhg9 0.2946691 1.289446e-09 1.580629e-07 -1.554840e-07
## 39: Spred1 0.9378469 2.720893e-04 3.024604e-02 -2.970186e-02
## 40: Stmn1 0.9654608 5.815127e-30 7.697017e-28 -7.580714e-28
## 41: Tgoln1 0.5152793 -1.326362e+00 1.835522e+01 -2.100794e+01
## 42: Tlk1 0.7640892 2.703688e-15 3.932912e-13 -3.878838e-13
## 43: Tmem184b 0.8618506 -5.367958e-16 1.029059e-12 -1.030133e-12
## 44: Tmx1 0.5806770 4.805034e-16 8.595864e-14 -8.499763e-14
## 45: Tnks2 0.4062025 -2.013906e-27 2.384359e-25 -2.424638e-25
## 46: Tnrc6c 0.9636313 -8.209821e-17 1.008796e-14 -1.025216e-14
## 47: Tra2b 0.8040551 5.456255e-13 1.029543e-10 -1.018631e-10
## 48: Ttc14 0.9457661 -2.917359e-28 3.730229e-26 -3.788576e-26
## 49: Unc93b1 0.6918524 -1.652286e-06 1.721162e-04 -1.754208e-04
## 50: Zfp148 0.7535605 -2.006690e-21 2.018380e-19 -2.058514e-19
## primerid Pr(>Chisq) coef ci.hi ci.lo
## fdr
## 1: 0.9733859
## 2: 0.9733859
## 3: 0.9733859
## 4: 0.9733859
## 5: 0.9733859
## 6: 0.9733859
## 7: 0.9733859
## 8: 0.9733859
## 9: 0.9733859
## 10: 0.9733859
## 11: 0.9733859
## 12: 0.9733859
## 13: 0.9733859
## 14: 0.9733859
## 15: 0.9733859
## 16: 0.9733859
## 17: 0.9733859
## 18: 0.9733859
## 19: 0.9733859
## 20: 0.9733859
## 21: 0.9733859
## 22: 0.9733859
## 23: 0.9733859
## 24: 0.9733859
## 25: 0.9733859
## 26: 0.9733859
## 27: 0.9733859
## 28: 0.9733859
## 29: 0.9733859
## 30: 0.9733859
## 31: 0.9733859
## 32: 0.9733859
## 33: 0.9733859
## 34: 0.9733859
## 35: 0.9733859
## 36: 0.9733859
## 37: 0.9733859
## 38: 0.9733859
## 39: 0.9733859
## 40: 0.9733859
## 41: 0.9733859
## 42: 0.9733859
## 43: 0.9733859
## 44: 0.9733859
## 45: 0.9733859
## 46: 0.9733859
## 47: 0.9733859
## 48: 0.9733859
## 49: 0.9733859
## 50: 0.9733859
## fdr
# sth wrong here, need to be fixed