Permutation

library(devtools)
library(Biobase)

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit, which, which.max,
##     which.min

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

library(limma)

## 
## Attaching package: 'limma'

## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA

library(edge)
library(genefilter)
library(qvalue) # Testing this!

Download the expression data set

Here we are going to use some data from the paper Evaluating gene expression in C57BL/6J and DBA/2J mouse striatum using RNA-Seq and microarrays. that is a comparative RNA-seq analysis of different mouse strains.

https://www.ncbi.nlm.nih.gov/pubmed?term=21455293

con = url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bottomly_eset.RData")
load(file = con)
close(con)
bot = bottomly.eset
pdata = pData(bot)
edata = as.matrix(exprs(bot))
fdata = fData(bot)
ls()

## [1] "bot"           "bottomly.eset" "con"           "edata"        
## [5] "fdata"         "pdata"

Transform the data

Here we will transform the data and remove lowly expressed genes.

edata = log2(as.matrix(edata) + 1)
edata = edata[rowMeans(edata) > 10, ]

Calculate p-values parametrically

With genefilter
Adjusting for variables with edge
P-values for moderated statistics with limma

There are a number of ways to calculate p-values directly. Here are a couple of examples, but there are many data-specific ways of calculating them using packages like snpStats or DESeq2 or diffbind.

Genefilter package contains ‘rowFtests’

fstats_obj = rowFtests(edata,as.factor(pdata$strain))
names(fstats_obj)

## [1] "statistic" "p.value"

hist(fstats_obj$p.value, col = 3)

EDGE - Adjusting for variables with ‘EDGE’ package

If you want to adjust for variables you need to use edge

build study first
adjust for lane number: ‘adj.var = as.factor(pdata$lane.number))’
Perform likelihood ratio test

edge_study = build_study(edata, 
                         grp = pdata$strain, 
                         adj.var = as.factor(pdata$lane.number))
de_obj = lrt(edge_study) # Likelihood ratio test
qval = qvalueObj(de_obj)
hist(qval$pvalues, col = 3)

Q-values

Q-values are used sometimes in conjuncton ith false discovery rates (FDR), where \[FDR = E[Q]\] while Q is a measure of the false discovery rates, \[Q = \frac{V}{V+S}\] where V = false positive rate a.k.a. Type 1 error, S = false negative rate a.k.a. Type 2 error.

LIMMA - P-values for moderated statistics with ‘LIMMA’ package

mod = model.matrix(~ pdata$strain + pdata$lane.number) # modereated model
fit_limma = lmFit(edata, mod)
ebayes_limma = eBayes(fit_limma)
limma_pvals = topTable(ebayes_limma,
                       number = dim(edata)[1])$P.Value

## Removing intercept from test coefficients

hist(limma_pvals, col = 4)

Calculating Empirical permutation p-values with ‘EDGE’ package

Often when you permute you are trying to calculate an empirical p-value. To do this we can compare each observed statistic to the permuted statistics. You can either compare within a single gene (argument pooled = FALSE in the empPvals function) or pooling the permuted statistics across multiple genes (argument pooled = TRUE in the empPvals function, the default).

set.seed(3333)
B = 1000
tstats_obj = rowttests(edata, pdata$strain) # Factor is strain
tstat0 = matrix(NA, nrow = dim(edata)[1], ncol = B)
tstat = tstats_obj$statistic
strain = pdata$strain
for(i in 1:B){
  strain0 = sample(strain)
  tstat0[,i] = rowttests(edata,strain0)$statistic
}

emp_pvals = empPvals(tstat, tstat0)
hist(emp_pvals, col = 2, main = "Empirical Values")

Multiple testing

Bonferroni and Benjamini-Hochberg FDR correction with p.adjust
Adjusted p-values from limma
Direct q-values
q-values using edge

To correct for multiple testing you can use the Bonferroni correction or different FDR corrections.

Bonferroni FDR correction with p.adjust

You can use the p.adjust function to get “multiple testing corrected” p-values which you can then use to control error rates.

fp_bonf = p.adjust(fstats_obj$p.value, method = "bonferroni")
hist(fp_bonf, col = 3)

quantile(fp_bonf)

##        0%       25%       50%       75%      100% 
## 0.1791152 1.0000000 1.0000000 1.0000000 1.0000000

sum(fp_bonf < 0.05)

## [1] 0

#fp_bonf < 0.05

sum(fp_bonf < 0.05)

## [1] 0

Benjamini-Hochberg (BH) FDR correction with p.adjust

# Benjamini-Hochberg False Discovery Rate (FDR) correction
fp_bh = p.adjust(fstats_obj$p.value, method = "BH")
hist(fp_bh, col = 3)

sum(fp_bh < 0.05)

## [1] 0

LIMMA - Adjusted p-values from ‘LIMMA’ package

limma_pvals_adj = topTable(ebayes_limma,
                           number=dim(edata)[1])$adj.P.Val

## Removing intercept from test coefficients

hist(limma_pvals_adj, col = 2)

# Adjusted Pvalues for limma model
quantile(limma_pvals_adj)

##          0%         25%         50%         75%        100% 
## 0.002770085 0.923682815 0.923682815 0.923682815 0.996015322

Direct q-values

qval_limma = qvalue(limma_pvals)
summary(qval_limma)

## 
## Call:
## qvalue(p = limma_pvals)
## 
## pi0: 0.3798397   
## 
## Cumulative number of significant calls:
## 
##           <1e-04 <0.001 <0.01 <0.025 <0.05 <0.1   <1
## p-value        2      2     4      7    21   41 1049
## q-value        0      0     2      2     2    2 1049
## local FDR      0      0     1      2     2    2    2

qval$pi0

## [1] 1

q-values using EDGE

qval = qvalueObj(de_obj)
summary(qval)

## 
## Call:
## qvalue(p = pval)
## 
## pi0: 1   
## 
## Cumulative number of significant calls:
## 
##           <1e-04 <0.001 <0.01 <0.025 <0.05 <0.1   <1
## p-value        0      0     1      2     8   20 1049
## q-value        0      0     0      0     0    0 1049
## local FDR      0      0     0      0     0    0    0

More information

Multiple testing is one of the most commmonly used tools in statistical genomics.

Statistical significance for genome-wide studies is a great place to start. http://www.pnas.org/content/100/16/9440.full
The qvalue package vignette is also informative. https://www.bioconductor.org/packages/release/bioc/vignettes/qvalue/inst/doc/qvalue.pdf
This is a nice review on How does multiple testing correction work? https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2907892/

sessionInfo()

## R version 3.3.1 (2016-06-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04 LTS
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] qvalue_2.6.0        genefilter_1.56.0   edge_2.6.0         
## [4] limma_3.30.13       Biobase_2.34.0      BiocGenerics_0.20.0
## [7] devtools_1.13.2    
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.11         lfa_1.4.0            nloptr_1.0.4        
##  [4] plyr_1.8.4           bitops_1.0-6         tools_3.3.1         
##  [7] digest_0.6.12        lme4_1.1-12          RSQLite_1.1-2       
## [10] annotate_1.52.1      evaluate_0.10        memoise_1.1.0       
## [13] tibble_1.3.3         gtable_0.2.0         nlme_3.1-131        
## [16] lattice_0.20-35      mgcv_1.8-17          jackstraw_1.1       
## [19] rlang_0.1.1          Matrix_1.2-8         DBI_0.6-1           
## [22] yaml_2.1.14          snm_1.22.0           withr_1.0.2         
## [25] stringr_1.2.0        knitr_1.16           IRanges_2.8.1       
## [28] S4Vectors_0.12.1     stats4_3.3.1         rprojroot_1.2       
## [31] grid_3.3.1           AnnotationDbi_1.36.2 survival_2.41-2     
## [34] XML_3.98-1.7         rmarkdown_1.6        sva_3.22.0          
## [37] minqa_1.2.4          ggplot2_2.2.1        reshape2_1.4.2      
## [40] corpcor_1.6.8        magrittr_1.5         backports_1.1.0     
## [43] scales_0.4.1         htmltools_0.3.6      MASS_7.3-45         
## [46] splines_3.3.1        xtable_1.8-2         colorspace_1.3-2    
## [49] stringi_1.1.5        RCurl_1.95-4.8       lazyeval_0.2.0      
## [52] munsell_0.4.3

GLM_3_14_pvalues_multiple_testing

MCC

July 6, 2017