Matthew Curcio, May 8, 2018

Introductory Material

Dependencies / Libraries

# Remove '#' to install packages:

#install.packages("devtools")
#install.packages("MatrixEQTL")
#source("http://www.bioconductor.org/biocLite.R")
#biocLite(c("Biobase"))
#install.packages(broom)
library(devtools)
library(broom)
library(Biobase)
library(MatrixEQTL)

Download the data

Here we are going to follow along with the tutorial on MatrixEQTL.

First we find the files http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/runit.html

base.dir = find.package("MatrixEQTL")
SNP_file_name = paste(base.dir, "/data/SNP.txt", sep = "");
expression_file_name = paste(base.dir, "/data/GE.txt", sep = "")
covariates_file_name = paste(base.dir, "/data/Covariates.txt", sep = "")
output_file_name = tempfile()

Next we load the data so we can see it

expr = read.table(expression_file_name,
                  sep = "\t",
                  header = T, 
                  row.names = 1)
expr[1,]
##         Sam_01 Sam_02 Sam_03 Sam_04 Sam_05 Sam_06 Sam_07 Sam_08 Sam_09
## Gene_01   4.91   4.63   5.18   5.07   5.74   5.09   5.31   5.29   4.73
##         Sam_10 Sam_11 Sam_12 Sam_13 Sam_14 Sam_15 Sam_16
## Gene_01   5.72   4.75   4.54   5.01   5.03   4.84   4.44
snps = read.table(SNP_file_name,
                  sep = "\t",
                  header = T,
                  row.names = 1)
snps[1,]
##        Sam_01 Sam_02 Sam_03 Sam_04 Sam_05 Sam_06 Sam_07 Sam_08 Sam_09
## Snp_01      2      0      2      0      2      1      2      1      1
##        Sam_10 Sam_11 Sam_12 Sam_13 Sam_14 Sam_15 Sam_16
## Snp_01      1      2      2      1      2      2      1
cvrt = read.table(covariates_file_name,
                  sep = "\t",
                  header = T,
                  row.names = 1)
cvrt[1,]
##        Sam_01 Sam_02 Sam_03 Sam_04 Sam_05 Sam_06 Sam_07 Sam_08 Sam_09
## gender      0      0      0      0      0      0      0      0      1
##        Sam_10 Sam_11 Sam_12 Sam_13 Sam_14 Sam_15 Sam_16
## gender      1      1      1      1      1      1      1

eQTL is linear regession

The simplest eQTL analysis just computes linear regression models for each SNP/gene pair.

e1 = as.numeric(expr[1,])
e1
##  [1] 4.91 4.63 5.18 5.07 5.74 5.09 5.31 5.29 4.73 5.72 4.75 4.54 5.01 5.03
## [15] 4.84 4.44
s1 = as.numeric(snps[1,])
s1
##  [1] 2 0 2 0 2 1 2 1 1 1 2 2 1 2 2 1
lm1 = lm(e1 ~ s1)
lm1
## 
## Call:
## lm(formula = e1 ~ s1)
## 
## Coefficients:
## (Intercept)           s1  
##     4.92968      0.06387
tidy(lm1)
##          term   estimate std.error  statistic      p.value
## 1 (Intercept) 4.92967742 0.2137508 23.0627348 1.546033e-12
## 2          s1 0.06387097 0.1386998  0.4604979 6.522304e-01

We can visualize the data and the model fits

plot(e1 ~ s1,
     col = (s1 + 1),
     main = "Linear Fit of Expression Vs Genotype",
     sub = "Y = 0.06387097 * X + 4.92967742",
     xaxt = "n",
     xlab = "Genotype",
     ylab = "Expression")

axis(1, at = c(0:2), labels = c("AA", "Aa", "aa"))

lines(lm1$fitted ~ s1, type = "b", pch = 15, col = "blue")

Fitting many eQTL models with MatrixEQTL

Set general parameters

We need to set up the p-value cutoff and the error model (in this case assuming independent errors)

pvOutputThreshold = 1e-2
errorCovariance = numeric()
useModel = modelLINEAR

Set the data up

Now we need to set up the snp and gene expression data in the special format required by the MatrixEQTL package

snps = SlicedData$new()
snps$fileDelimiter = "\t"       # the TAB character
snps$fileOmitCharacters = "NA"  # denote missing values;
snps$fileSkipRows = 1           # one row of column labels
snps$fileSkipColumns = 1        # one column of row labels
snps$fileSliceSize = 2000       # read file in pieces of 2,000 rows
snps$LoadFile( SNP_file_name )
## Rows read: 15 done.
gene = SlicedData$new()
gene$fileDelimiter = "\t"           # the TAB character
gene$fileOmitCharacters = "NA"      # denote missing values;
gene$fileSkipRows = 1               # one row of column labels
gene$fileSkipColumns = 1            # one column of row labels
gene$fileSliceSize = 2000           # read file in pieces of 2,000 rows
gene$LoadFile(expression_file_name)
## Rows read: 10 done.
cvrt = SlicedData$new()

Running MatrixEQTL

We can now run the code to calculate the eQTL that we are interested in

me = Matrix_eQTL_engine(snps = snps,
                        gene = gene,
                        cvrt = cvrt,
                        output_file_name = NULL,
                        # vOutputThreshold = pvOutputThreshold,
                        useModel = useModel,
                        errorCovariance = errorCovariance,
                        verbose = TRUE,
                        pvalue.hist = TRUE,
                        min.pv.by.genesnp = FALSE,
                        noFDRsaveMemory = FALSE)
## Processing covariates
## Task finished in 0.00300000000000011 seconds
## Processing gene expression data (imputation, residualization)
## Task finished in 0.00300000000000011 seconds
## Creating output file(s)
## Task finished in 0.0129999999999999 seconds
## Performing eQTL analysis
## 100.00% done, 0 eQTLs
## No significant associations were found.
## Task finished in 0.00600000000000023 seconds
## 

Visualizing the results

We can make a plot of all the p-values from the tests

plot(me)

We can also figure look at the number and type of eQTL

me$all$neqtls
## [1] 0
me$all$eqtls
## [1] snps      gene      beta      statistic pvalue    FDR      
## <0 rows> (or 0-length row.names)

More information

eQTL is an entire field of research.

MatrixEQTL package * http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/

MatrixEQTL paper * http://bioinformatics.oxfordjournals.org/content/28/10/1353.abstract?keytype=ref&ijkey=zjMWpHTAUk7OJFw

Surrogate variables applied to eQTL * http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.0030161

A Bayesian sva framework for eQTL * http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1000770

Bioconductor eQTL workflow * http://www.bioconductor.org/help/workflows/eQTL/

devtools::session_info()
## Session info -------------------------------------------------------------
##  setting  value                       
##  version  R version 3.4.4 (2018-03-15)
##  system   x86_64, linux-gnu           
##  ui       X11                         
##  language en_US                       
##  collate  en_US.UTF-8                 
##  tz       America/New_York            
##  date     2018-05-08
## Packages -----------------------------------------------------------------
##  package      * version date       source         
##  assertthat     0.2.0   2017-04-11 CRAN (R 3.4.4) 
##  backports      1.1.2   2017-12-13 CRAN (R 3.4.4) 
##  base         * 3.4.4   2018-03-16 local          
##  bindr          0.1.1   2018-03-13 CRAN (R 3.4.4) 
##  bindrcpp       0.2.2   2018-03-29 CRAN (R 3.4.4) 
##  Biobase      * 2.38.0  2018-05-09 Bioconductor   
##  BiocGenerics * 0.24.0  2018-05-09 Bioconductor   
##  broom        * 0.4.4   2018-03-29 CRAN (R 3.4.4) 
##  compiler       3.4.4   2018-03-16 local          
##  datasets     * 3.4.4   2018-03-16 local          
##  devtools     * 1.13.5  2018-02-18 CRAN (R 3.4.4) 
##  digest         0.6.15  2018-01-28 CRAN (R 3.4.4) 
##  dplyr          0.7.4   2017-09-28 CRAN (R 3.4.4) 
##  evaluate       0.10.1  2017-06-24 cran (@0.10.1) 
##  foreign        0.8-70  2018-04-23 CRAN (R 3.4.4) 
##  glue           1.2.0   2017-10-29 cran (@1.2.0)  
##  graphics     * 3.4.4   2018-03-16 local          
##  grDevices    * 3.4.4   2018-03-16 local          
##  grid           3.4.4   2018-03-16 local          
##  htmltools      0.3.6   2017-04-28 cran (@0.3.6)  
##  knitr          1.20    2018-02-20 CRAN (R 3.4.4) 
##  lattice        0.20-35 2017-03-25 CRAN (R 3.4.4) 
##  magrittr       1.5     2014-11-22 cran (@1.5)    
##  MatrixEQTL   * 2.2     2018-01-13 CRAN (R 3.4.4) 
##  memoise        1.1.0   2017-04-21 CRAN (R 3.4.4) 
##  methods      * 3.4.4   2018-03-16 local          
##  mnormt         1.5-5   2016-10-15 CRAN (R 3.4.4) 
##  nlme           3.1-137 2018-04-07 CRAN (R 3.4.4) 
##  parallel     * 3.4.4   2018-03-16 local          
##  pillar         1.2.2   2018-04-26 CRAN (R 3.4.4) 
##  pkgconfig      2.0.1   2017-03-21 CRAN (R 3.4.4) 
##  plyr           1.8.4   2016-06-08 CRAN (R 3.4.4) 
##  psych          1.8.4   2018-05-06 CRAN (R 3.4.4) 
##  purrr          0.2.4   2017-10-18 CRAN (R 3.4.4) 
##  R6             2.2.2   2017-06-17 CRAN (R 3.4.4) 
##  Rcpp           0.12.16 2018-03-13 cran (@0.12.16)
##  reshape2       1.4.3   2017-12-11 CRAN (R 3.4.4) 
##  rlang          0.2.0   2018-02-20 CRAN (R 3.4.4) 
##  rmarkdown      1.9     2018-03-01 CRAN (R 3.4.4) 
##  rprojroot      1.3-2   2018-01-03 CRAN (R 3.4.4) 
##  stats        * 3.4.4   2018-03-16 local          
##  stringi        1.2.2   2018-05-02 CRAN (R 3.4.4) 
##  stringr        1.3.0   2018-02-19 cran (@1.3.0)  
##  tibble         1.4.2   2018-01-22 CRAN (R 3.4.4) 
##  tidyr          0.8.0   2018-01-29 CRAN (R 3.4.4) 
##  tools          3.4.4   2018-03-16 local          
##  utils        * 3.4.4   2018-03-16 local          
##  withr          2.1.2   2018-03-15 CRAN (R 3.4.4) 
##  yaml           2.1.19  2018-05-01 CRAN (R 3.4.4)