See: http://jtleek.com/genstats/inst/doc/03_14_P-values-and-Multiple-Testing.html
library(devtools)
library(Biobase)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colnames,
## do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff,
## sort, table, tapply, union, unique, unsplit, which, which.max,
## which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
library(limma)
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
library(edge)
library(genefilter)
library(qvalue) # Testing this!
Here we are going to use some data from the paper Evaluating gene expression in C57BL/6J and DBA/2J mouse striatum using RNA-Seq and microarrays. that is a comparative RNA-seq analysis of different mouse strains.
https://www.ncbi.nlm.nih.gov/pubmed?term=21455293
con = url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bottomly_eset.RData")
load(file = con)
close(con)
bot = bottomly.eset
pdata = pData(bot)
edata = as.matrix(exprs(bot))
fdata = fData(bot)
ls()
## [1] "bot" "bottomly.eset" "con" "edata"
## [5] "fdata" "pdata"
Here we will transform the data and remove lowly expressed genes.
edata = log2(as.matrix(edata) + 1)
edata = edata[rowMeans(edata) > 10, ]
There are a number of ways to calculate p-values directly. Here are a couple of examples, but there are many data-specific ways of calculating them using packages like snpStats or DESeq2 or diffbind.
fstats_obj = rowFtests(edata,as.factor(pdata$strain))
names(fstats_obj)
## [1] "statistic" "p.value"
hist(fstats_obj$p.value, col = 3)
If you want to adjust for variables you need to use edge
edge_study = build_study(edata,
grp = pdata$strain,
adj.var = as.factor(pdata$lane.number))
de_obj = lrt(edge_study) # Likelihood ratio test
qval = qvalueObj(de_obj)
hist(qval$pvalues, col = 3)
Q-values are used sometimes in conjuncton ith false discovery rates (FDR), where \[FDR = E[Q]\] while Q is a measure of the false discovery rates, \[Q = \frac{V}{V+S}\] where V = false positive rate a.k.a. Type 1 error, S = false negative rate a.k.a. Type 2 error.
mod = model.matrix(~ pdata$strain + pdata$lane.number) # modereated model
fit_limma = lmFit(edata, mod)
ebayes_limma = eBayes(fit_limma)
limma_pvals = topTable(ebayes_limma,
number = dim(edata)[1])$P.Value
## Removing intercept from test coefficients
hist(limma_pvals, col = 4)
Often when you permute you are trying to calculate an empirical p-value. To do this we can compare each observed statistic to the permuted statistics. You can either compare within a single gene (argument pooled = FALSE in the empPvals function) or pooling the permuted statistics across multiple genes (argument pooled = TRUE in the empPvals function, the default).
set.seed(3333)
B = 1000
tstats_obj = rowttests(edata, pdata$strain) # Factor is strain
tstat0 = matrix(NA, nrow = dim(edata)[1], ncol = B)
tstat = tstats_obj$statistic
strain = pdata$strain
for(i in 1:B){
strain0 = sample(strain)
tstat0[,i] = rowttests(edata,strain0)$statistic
}
emp_pvals = empPvals(tstat, tstat0)
hist(emp_pvals, col = 2, main = "Empirical Values")
To correct for multiple testing you can use the Bonferroni correction or different FDR corrections.
You can use the p.adjust function to get “multiple testing corrected” p-values which you can then use to control error rates.
fp_bonf = p.adjust(fstats_obj$p.value, method = "bonferroni")
hist(fp_bonf, col = 3)
quantile(fp_bonf)
## 0% 25% 50% 75% 100%
## 0.1791152 1.0000000 1.0000000 1.0000000 1.0000000
sum(fp_bonf < 0.05)
## [1] 0
#fp_bonf < 0.05
sum(fp_bonf < 0.05)
## [1] 0
# Benjamini-Hochberg False Discovery Rate (FDR) correction
fp_bh = p.adjust(fstats_obj$p.value, method = "BH")
hist(fp_bh, col = 3)
sum(fp_bh < 0.05)
## [1] 0
limma_pvals_adj = topTable(ebayes_limma,
number=dim(edata)[1])$adj.P.Val
## Removing intercept from test coefficients
hist(limma_pvals_adj, col = 2)
# Adjusted Pvalues for limma model
quantile(limma_pvals_adj)
## 0% 25% 50% 75% 100%
## 0.002770085 0.923682815 0.923682815 0.923682815 0.996015322
qval_limma = qvalue(limma_pvals)
summary(qval_limma)
##
## Call:
## qvalue(p = limma_pvals)
##
## pi0: 0.3798397
##
## Cumulative number of significant calls:
##
## <1e-04 <0.001 <0.01 <0.025 <0.05 <0.1 <1
## p-value 2 2 4 7 21 41 1049
## q-value 0 0 2 2 2 2 1049
## local FDR 0 0 1 2 2 2 2
qval$pi0
## [1] 1
qval = qvalueObj(de_obj)
summary(qval)
##
## Call:
## qvalue(p = pval)
##
## pi0: 1
##
## Cumulative number of significant calls:
##
## <1e-04 <0.001 <0.01 <0.025 <0.05 <0.1 <1
## p-value 0 0 1 2 8 20 1049
## q-value 0 0 0 0 0 0 1049
## local FDR 0 0 0 0 0 0 0
More information
Multiple testing is one of the most commmonly used tools in statistical genomics.
Statistical significance for genome-wide studies is a great place to start. http://www.pnas.org/content/100/16/9440.full
The qvalue package vignette is also informative. https://www.bioconductor.org/packages/release/bioc/vignettes/qvalue/inst/doc/qvalue.pdf
This is a nice review on How does multiple testing correction work? https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2907892/
sessionInfo()
## R version 3.3.1 (2016-06-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04 LTS
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] qvalue_2.6.0 genefilter_1.56.0 edge_2.6.0
## [4] limma_3.30.13 Biobase_2.34.0 BiocGenerics_0.20.0
## [7] devtools_1.13.2
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.11 lfa_1.4.0 nloptr_1.0.4
## [4] plyr_1.8.4 bitops_1.0-6 tools_3.3.1
## [7] digest_0.6.12 lme4_1.1-12 RSQLite_1.1-2
## [10] annotate_1.52.1 evaluate_0.10 memoise_1.1.0
## [13] tibble_1.3.3 gtable_0.2.0 nlme_3.1-131
## [16] lattice_0.20-35 mgcv_1.8-17 jackstraw_1.1
## [19] rlang_0.1.1 Matrix_1.2-8 DBI_0.6-1
## [22] yaml_2.1.14 snm_1.22.0 withr_1.0.2
## [25] stringr_1.2.0 knitr_1.16 IRanges_2.8.1
## [28] S4Vectors_0.12.1 stats4_3.3.1 rprojroot_1.2
## [31] grid_3.3.1 AnnotationDbi_1.36.2 survival_2.41-2
## [34] XML_3.98-1.7 rmarkdown_1.6 sva_3.22.0
## [37] minqa_1.2.4 ggplot2_2.2.1 reshape2_1.4.2
## [40] corpcor_1.6.8 magrittr_1.5 backports_1.1.0
## [43] scales_0.4.1 htmltools_0.3.6 MASS_7.3-45
## [46] splines_3.3.1 xtable_1.8-2 colorspace_1.3-2
## [49] stringi_1.1.5 RCurl_1.95-4.8 lazyeval_0.2.0
## [52] munsell_0.4.3