Download

Preparation and basic stats

Genomic data set - chr22_GEUVADIS_358_continental

Loading the data

library(readr)
library(tidyr)
library(dplyr)

snps = read_tsv("/Users/igorhut/Documents/eQTL/Data/Voja_QTL_tools/SNP_voja.txt")
expr = read_tsv("/Users/igorhut/Documents/eQTL/Data/Voja_QTL_tools/GE_voja.txt")

Basic stats

To get a basic idea of how many samples, genes and which genotyped positions are available in the data set, we’ll print out the “head” and dimensions of the loaded data sets. For the first 10 genes we’ll print out the mean expression levels + standard deviation. The same goes for the first 10 SNPs in the snps data frame.

head(expr)[1:12]
## # A tibble: 6 x 12
##   geneid   HG00096 HG00097 HG00099 HG00100 HG00101 HG00102 HG00103 HG00105
##   <chr>      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 ENSG000…  0.788   1.60    1.28    1.87    1.86    0.503   1.78    1.86  
## 2 ENSG000…  0.461   1.30    0.0309  0.933   1.44    0.121   1.84    0.358 
## 3 ENSG000…  0.331   0.641   0.0236  0.115   0.464   0.211   0.844   0.164 
## 4 ENSG000…  0.156   0.297   0       0.0866  0.277   0.0996  0.823   0     
## 5 ENSG000…  3.06    2.76    3.17    2.23    3.43    2.15    2.88    3.14  
## 6 ENSG000…  0.0466  0.0221  0.0399  0.0258  0.0550  0.0149  0.0307  0.0461
## # ... with 3 more variables: HG00106 <dbl>, HG00108 <dbl>, HG00109 <dbl>
dim(expr)
## [1] 608 359
head(snps)[1:12]
## # A tibble: 6 x 12
##   snipid   HG00096 HG00097 HG00099 HG00100 HG00101 HG00102 HG00103 HG00105
##   <chr>      <int>   <int>   <int>   <int>   <int>   <int>   <int>   <int>
## 1 chr22_1…       0       1       1       0       0       1       0       0
## 2 chr22_1…       1       0       0       0       0       0       1       0
## 3 chr22_1…       0       1       1       0       0       1       0       0
## 4 chr22_1…       0       0       0       0       0       0       0       0
## 5 chr22_1…       0       0       0       0       0       0       0       0
## 6 chr22_1…       1       1       1       2       2       1       1       1
## # ... with 3 more variables: HG00106 <int>, HG00108 <int>, HG00109 <int>
dim(snps)
## [1] 85485   359
paste("Mean value of expression for gene ",expr$geneid[1:10]," is ", rowMeans(expr[1:10, -1]))
##  [1] "Mean value of expression for gene  ENSG00000215270.3  is  1.37195382262709"   
##  [2] "Mean value of expression for gene  ENSG00000237438.2  is  1.47948167480269"   
##  [3] "Mean value of expression for gene  ENSG00000273203.1  is  0.658849616400615"  
##  [4] "Mean value of expression for gene  ENSG00000273442.1  is  0.30150779514162"   
##  [5] "Mean value of expression for gene  ENSG00000177663.9  is  2.76893802934637"   
##  [6] "Mean value of expression for gene  ENSG00000183307.3  is  0.0330109958248743" 
##  [7] "Mean value of expression for gene  ENSG00000069998.8  is  12.5103110517179"   
##  [8] "Mean value of expression for gene  ENSG00000093072.11  is  9.5873076093771"   
##  [9] "Mean value of expression for gene  ENSG00000099954.14  is  0.0257808687212151"
## [10] "Mean value of expression for gene  ENSG00000182902.9  is  0.0559065276387877"
paste("Standard deviation of expression for gene ", expr$geneid[1:10]," is ", t(apply(expr[1:10, -1], 1, sd)))
##  [1] "Standard deviation of expression for gene  ENSG00000215270.3  is  0.472651252805272"  
##  [2] "Standard deviation of expression for gene  ENSG00000237438.2  is  1.20578650765609"   
##  [3] "Standard deviation of expression for gene  ENSG00000273203.1  is  0.516317432602619"  
##  [4] "Standard deviation of expression for gene  ENSG00000273442.1  is  0.30999282888107"   
##  [5] "Standard deviation of expression for gene  ENSG00000177663.9  is  0.663435616569955"  
##  [6] "Standard deviation of expression for gene  ENSG00000183307.3  is  0.022646815889866"  
##  [7] "Standard deviation of expression for gene  ENSG00000069998.8  is  2.92736948283107"   
##  [8] "Standard deviation of expression for gene  ENSG00000093072.11  is  5.07079616962829"  
##  [9] "Standard deviation of expression for gene  ENSG00000099954.14  is  0.0384475820388767"
## [10] "Standard deviation of expression for gene  ENSG00000182902.9  is  0.0309177015985779"
paste("Mean value for SNP ",snps$snipid[1:10]," is ", rowMeans(snps[1:10, -1]))
##  [1] "Mean value for SNP  chr22_16051249  is  0.187150837988827"
##  [2] "Mean value for SNP  chr22_16052080  is  0.251396648044693"
##  [3] "Mean value for SNP  chr22_16052962  is  0.153631284916201"
##  [4] "Mean value for SNP  chr22_16052986  is  0.108938547486034"
##  [5] "Mean value for SNP  chr22_16053444  is  0.108938547486034"
##  [6] "Mean value for SNP  chr22_16053659  is  1.4413407821229"  
##  [7] "Mean value for SNP  chr22_16053791  is  0.53072625698324" 
##  [8] "Mean value for SNP  chr22_16053862  is  0.187150837988827"
##  [9] "Mean value for SNP  chr22_16053863  is  0.251396648044693"
## [10] "Mean value for SNP  chr22_16054454  is  0.187150837988827"
paste("Standard deviation for SNP ", snps$snipid[1:10]," is ", t(apply(snps[1:10, -1], 1, sd)))
##  [1] "Standard deviation for SNP  chr22_16051249  is  0.418282783518628"
##  [2] "Standard deviation for SNP  chr22_16052080  is  0.434423220035923"
##  [3] "Standard deviation for SNP  chr22_16052962  is  0.376294327742715"
##  [4] "Standard deviation for SNP  chr22_16052986  is  0.320850570593205"
##  [5] "Standard deviation for SNP  chr22_16053444  is  0.320850570593205"
##  [6] "Standard deviation for SNP  chr22_16053659  is  0.635698421409098"
##  [7] "Standard deviation for SNP  chr22_16053791  is  0.619844588756408"
##  [8] "Standard deviation for SNP  chr22_16053862  is  0.418282783518628"
##  [9] "Standard deviation for SNP  chr22_16053863  is  0.434423220035923"
## [10] "Standard deviation for SNP  chr22_16054454  is  0.418282783518628"

MAF

Minor allele frequency (MAF) is a measure of the presence of an allele in a population. More precisely MAF refers to the frequency at which the second most common allele occurs in a given population. Every individual person has about 4.1-5 million bases in which he differs from the reference genome (ref). Some of those variants are common in a certain population, others not. In order to measure the “rareness”" of a specific variant (allele) MAF can be calculated. In case of single nucleotide polymorphisms there can be up to four different alleles in one position (A, C, G, T). Those alleles can be homozygous or heterozygous, when the maternal allele was different from the paternal allele at that position. SNPs in a population are always defined by the genomic position and by two alleles: The allele defined in the reference genome and one allele present in some individuals, but different from the reference sequence.

An example:

Spotting the differences.

Spotting the differences.

This individual has two SNPs, he is heterozygous in one of them and homozygous in the other. In eQTL analyses a SNP is always defined as a single allele being different from the reference. If in a population there are multiple different alleles for one position, they would be treated as independent entities of SNPs. Now we know that in eQTL analyses SNPs can only have two alleles: The reference and the alternative. Calculating MAF is essentially counting the presence of the alleles in a population and representing it as a percentage. Each individual can have 0, 1 or 2 times the alternative allele.

Example

The term MAF implies that the allele for which we return the measure has to be the minor (= less common) allele. This means that the MAF is smaller than 0.5 by definition.

So let’s now calculate the MAF for all SNPs among all individuals and correct the returned values so that the value is always given in respect to the minor allele. Then we’ll plot a histogram of the MAFs of all SNPs

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
maf = rowMeans(snps[-1])/2
sum(maf > 0.5) # so there are 21130 alleses which are actually major alleles in this population sample
## [1] 21130
# To ensure we actually get the MAF this needs to be inverted.
maf <- pmin(maf, 1-maf)

truehist(maf, main = "Histogram of MAF values.", col = "steelblue")
lines(density(maf), lty = 2, col = "darkorange", lwd = 3)

Filtering SNPs by MAF

In an eQTL study often a minimum MAF is required. Since MAF essentially reflects how often an allele has been observed in a population, it also defines how often the gene expression levels have been observed for heterozygous and homozygous alleles.

Example

Calculate the number of heterozygous and homozygous observations expected for SNPs with a MAF of 5%, 10% and 15% in a sample of 500 individuals given Hardy-Weinberg equilibrium. What are useful MAF thresholds for SNPs to include in an eQTL analysis?

p = c(0.05, 0.1, 0.15)
q = 1-p
# Calulate frequency of minor allele being present in homozygous and heterozygous state
f_hom = p^2
f_het = 2*p*q
# Expected number of observations in a sample size of 10000
sample_size = 500
round(f_hom * sample_size)
## [1]  1  5 11
round(f_het * sample_size)
## [1]  48  90 128

Gene expression profiling

Now that we have an idea of what is stored in the genotype data frame let’s take a look at the expression data. For eQTL analyses it is important for the gene expression to be normally distributed among samples, therefore RNA-seq data has to be transformed by, for example quantile normalization.

Let’s check the distribution of gene expression levels across samples for the first gene in our expr data frame.

gname = expr$geneid[1]
gname
## [1] "ENSG00000215270.3"
truehist(as.numeric(filter(expr, geneid == gname)[-1]), main = paste("Gene expression profile for gene:",gname), xlab = "Expression level", col = "darkorange")
lines(density(as.numeric(filter(expr, geneid == gname)[-1])), lty = 2, col = "steelblue", lwd = 3)

Example

Now we’ll plot the expression levels of the first gene against the first SNP, 10th gene against the 10th SNP, 10th gene against the 1th SNP, as well as the 10th gene against the 6th SNP, depending on the genotypes of the samples by using simple dot plots. We’ll add a bit of random noise (jitter) to the genotype data to make it all look more comprehensible.

genotype = c("snp_1", "snp_10", "snp_1", "snp_6")
genes = c("gene_1", "gene_10", "gene_10", "gene_10")
par(mfrow=c(1,length(genotype)))

plot(jitter(as.numeric(snps[1,-1]), factor = 0.5), as.numeric(expr[1,-1]),
     xlab = genotype[1], ylab = genes[1], col = "steelblue",
     main = paste(genes[1], "vs", genotype[1]), xlim= c(-0.5,2.5), xaxt="n")
axis(1, at =c (0,1,2), labels = c("0", "1", "2"))
plot(jitter(as.numeric(snps[10,-1]), factor = 0.5), as.numeric(expr[10,-1]),
     xlab = genotype[2], ylab = genes[2], col = "steelblue",
     main = paste(genes[2], "vs", genotype[2]), xlim= c(-0.5,2.5), xaxt="n")
axis(1, at =c (0,1,2), labels = c("0", "1", "2"))
plot(jitter(as.numeric(snps[1,-1]), factor = 0.5), as.numeric(expr[10,-1]),
     xlab = genotype[3], ylab = genes[3], col = "steelblue",
     main = paste(genes[3], "vs", genotype[3]), xlim= c(-0.5,2.5), xaxt="n")
axis(1, at =c (0,1,2), labels = c("0", "1", "2"))
plot(jitter(as.numeric(snps[6,-1]), factor = 0.5), as.numeric(expr[10,-1]),
     xlab = genotype[4], ylab = genes[4], col = "steelblue",
     main = paste(genes[4], "vs", genotype[4]), xlim= c(-0.5,2.5), xaxt="n")
axis(1, at =c (0,1,2), labels = c("0", "1", "2"))

Let’s do a bit of data wrangling for easier downstream analysis and efficient plotting. Namely we’ll transpose our data frames so that we have variables, i.e. SNPs and expression levels as columns and samples as rows.

expr_trans = data.frame(t(expr[, -1]))
colnames(expr_trans)=t(expr[, 1])
expr_trans = tibble::rownames_to_column(expr_trans, "sample")
head(expr_trans)[1:10]
##    sample ENSG00000215270.3 ENSG00000237438.2 ENSG00000273203.1
## 1 HG00096         0.7878784        0.46099598        0.33084190
## 2 HG00097         1.6030633        1.29599155        0.64136258
## 3 HG00099         1.2839206        0.03093323        0.02358726
## 4 HG00100         1.8733944        0.93279608        0.11472217
## 5 HG00101         1.8611959        1.44133066        0.46404172
## 6 HG00102         0.5027009        0.12111466        0.21109161
##   ENSG00000273442.1 ENSG00000177663.9 ENSG00000183307.3 ENSG00000069998.8
## 1        0.15617620          3.064441        0.04658481         11.122808
## 2        0.29658085          2.758744        0.02211631         11.310690
## 3        0.00000000          3.167703        0.03985498         13.533813
## 4        0.08664862          2.225839        0.02584587         12.859737
## 5        0.27669990          3.434325        0.05502338          7.176259
## 6        0.09964725          2.151189        0.01486157          6.181117
##   ENSG00000093072.11 ENSG00000099954.14
## 1          11.195425        0.011522319
## 2           7.668832        0.047408942
## 3          11.774297        0.006571839
## 4           9.425467        0.000000000
## 5          12.055840        0.000000000
## 6          10.282059        0.007351744
# and the same for genotype data
snps_trans = data.frame(t(snps[-1]))
colnames(snps_trans)=t(snps[, 1])
snps_trans = tibble::rownames_to_column(snps_trans, "sample")
head(snps_trans)[1:10]
##    sample chr22_16051249 chr22_16052080 chr22_16052962 chr22_16052986
## 1 HG00096              0              1              0              0
## 2 HG00097              1              0              1              0
## 3 HG00099              1              0              1              0
## 4 HG00100              0              0              0              0
## 5 HG00101              0              0              0              0
## 6 HG00102              1              0              1              0
##   chr22_16053444 chr22_16053659 chr22_16053791 chr22_16053862
## 1              0              1              0              0
## 2              0              1              0              1
## 3              0              1              0              1
## 4              0              2              1              0
## 5              0              2              1              0
## 6              0              1              0              1
##   chr22_16053863
## 1              1
## 2              0
## 3              0
## 4              0
## 5              0
## 6              0

Another convenient way to display gene expression values by genotype is as box plots. These provide a good, nonparametric, indication of the distributions. To convey a sense of the frequency of each genotype in the sample it is useful to also add points for each individual to the plot. Below is an example of how this might look for first ten SNP/gene pairs. This time we’ll use R’s ggplot2 library to generate visualization.

library(ggplot2)

#Reshape dataframes a bit for use with ggplot2
snps_long = tidyr::gather(snps_trans[, 1:5], snp, genotype, -sample)
expr_long = tidyr::gather(expr_trans[, 1:5], gene, expression, -sample)
head(snps_long)
##    sample            snp genotype
## 1 HG00096 chr22_16051249        0
## 2 HG00097 chr22_16051249        1
## 3 HG00099 chr22_16051249        1
## 4 HG00100 chr22_16051249        0
## 5 HG00101 chr22_16051249        0
## 6 HG00102 chr22_16051249        1
head(expr_long)
##    sample              gene expression
## 1 HG00096 ENSG00000215270.3  0.7878784
## 2 HG00097 ENSG00000215270.3  1.6030633
## 3 HG00099 ENSG00000215270.3  1.2839206
## 4 HG00100 ENSG00000215270.3  1.8733944
## 5 HG00101 ENSG00000215270.3  1.8611959
## 6 HG00102 ENSG00000215270.3  0.5027009
data_long <- cbind(snps_long, expr_long["expression"])
data_long$genotype <- as.factor(data_long$genotype)
head(data_long)
##    sample            snp genotype expression
## 1 HG00096 chr22_16051249        0  0.7878784
## 2 HG00097 chr22_16051249        1  1.6030633
## 3 HG00099 chr22_16051249        1  1.2839206
## 4 HG00100 chr22_16051249        0  1.8733944
## 5 HG00101 chr22_16051249        0  1.8611959
## 6 HG00102 chr22_16051249        1  0.5027009
ggplot(data_long, aes(genotype, expression)) +
  geom_jitter(colour = "darkorange",alpha = 0.3, width = 0.02) +
  geom_boxplot(alpha = 0.5, fill = "steelblue") +
  facet_wrap(~snp) 

# Let's do that for more SNPs and genes, just to see whether we could catch any
# visible correlations

snps_long = tidyr::gather(snps_trans[, 1:10], snp, genotype, -sample)
expr_long = tidyr::gather(expr_trans[, 1:10], gene, expression, -sample)
head(snps_long)
##    sample            snp genotype
## 1 HG00096 chr22_16051249        0
## 2 HG00097 chr22_16051249        1
## 3 HG00099 chr22_16051249        1
## 4 HG00100 chr22_16051249        0
## 5 HG00101 chr22_16051249        0
## 6 HG00102 chr22_16051249        1
head(expr_long)
##    sample              gene expression
## 1 HG00096 ENSG00000215270.3  0.7878784
## 2 HG00097 ENSG00000215270.3  1.6030633
## 3 HG00099 ENSG00000215270.3  1.2839206
## 4 HG00100 ENSG00000215270.3  1.8733944
## 5 HG00101 ENSG00000215270.3  1.8611959
## 6 HG00102 ENSG00000215270.3  0.5027009
data_long <- cbind(snps_long, expr_long["expression"])
data_long$genotype <- as.factor(data_long$genotype)
head(data_long)
##    sample            snp genotype expression
## 1 HG00096 chr22_16051249        0  0.7878784
## 2 HG00097 chr22_16051249        1  1.6030633
## 3 HG00099 chr22_16051249        1  1.2839206
## 4 HG00100 chr22_16051249        0  1.8733944
## 5 HG00101 chr22_16051249        0  1.8611959
## 6 HG00102 chr22_16051249        1  0.5027009
ggplot(data_long, aes(genotype, expression)) +
  geom_jitter(colour = "darkorange",alpha = 0.3, width = 0.02) +
  geom_boxplot(alpha = 0.5, fill = "steelblue", 
               position = position_dodge(width = 0)) +
  facet_wrap(~snp) 

Understanding the basics

This chapter should explain the basic ideas behind eQTL analyses. What we are doing here is not what one would do to run an actual eQTL analysis. Here we’ll try to explain how eQTL mapping works in general.

Linear regression of genotype on phenotype

The most common way of estimating the effect of a SNP on gene expression is by performing a linear regression of sample genotypes on sample gene expression levels. So to obtain estimates of the genotypic contribution to gene expression we fit a simple linear regression model of the form \(E_i = \beta_0 + \beta G_i + \epsilon\), where \(E_i\) is the vector of gene expression values for gene \(i\) and \(G_i\) is the genotype vector for the SNP \(i\). We are interested in the estimate for \(\beta\) which indicates the change in gene expression for each copy of the second allele

The p-value indicates the significance of the genetic component in the model. Let’s try that for gene 10 with SNP 1 and SNP 6.

lm_1_10 = lm(expr_trans[, 11] ~ snps_trans[, 2])
summary(lm_1_10)
## 
## Call:
## lm(formula = expr_trans[, 11] ~ snps_trans[, 2])
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.056455 -0.021535 -0.003053  0.017321  0.145656 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.056455   0.001792  31.510   <2e-16 ***
## snps_trans[, 2] -0.002931   0.003914  -0.749    0.455    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03094 on 356 degrees of freedom
## Multiple R-squared:  0.001572,   Adjusted R-squared:  -0.001232 
## F-statistic: 0.5605 on 1 and 356 DF,  p-value: 0.4545
lm_6_10 = lm(expr_trans[, 11] ~ snps_trans[, 7])
summary(lm_6_10)
## 
## Call:
## lm(formula = expr_trans[, 11] ~ snps_trans[, 7])
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.056641 -0.022338 -0.003267  0.017837  0.146785 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     0.054012   0.004058   13.31   <2e-16 ***
## snps_trans[, 7] 0.001315   0.002577    0.51     0.61    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03095 on 356 degrees of freedom
## Multiple R-squared:  0.0007307,  Adjusted R-squared:  -0.002076 
## F-statistic: 0.2603 on 1 and 356 DF,  p-value: 0.6102

This is the standard summary output from R for linear regressions. Since we are interested in eQTLs our main interest lies in the second line of “Coefficients”. What is stated as “Estimate” is the slope of the linear regression, which in eQTL terms is called “effect size” or already mentioned “beta”. In eQTL studies one normally compares thousands of genes for which each hundreds to thousands of SNPs have been tested. The common way to identify eQTLs is by their p-value. The p-value given here as (Pr(>|t|)) will later be referred to as raw p-value. It can be calculated in many different ways, here it is based on the t-value which is derived from the estimation of the coefficient and its standard error. For a nice explanation of the summary(lm) output check here.

It is obvious that these two models don’t catch any significant effect of explored SNPs and gene expression levels. In fact it is very hard to catch such effects by “manually” exploring real-world genomic data, usually containing tens or hundreds of thousands SNPs and hundreds or thousands gene expressions for large number of samples.

Hence for the sake of making a comprehensable example we’ll partially repeat the previous EDA process with a dummy, i.e. simulated data set, which can be found here

gt = read.table("/Users/igorhut/Documents/eQTL/jknightlab-eqtl-intro-e17c9f5/docker/data/simulated/sim_genotypes.tab", sep="\t",
header=TRUE, row.names = 1)
expr = read.table("/Users/igorhut/Documents/eQTL/jknightlab-eqtl-intro-e17c9f5/docker/data/simulated/sim_expression1.tab", sep="\t",
header=TRUE, row.names = 1)

head(gt)
##          snp_1 snp_2 snp_3 snp_4 snp_5 snp_6 snp_7 snp_8 snp_9 snp_10
## sample_1     0     0     0     0     1     0     0     0     2      2
## sample_2     0     0     0     0     0     0     0     0     2      2
## sample_3     0     0     0     0     0     0     0     0     1      2
## sample_4     0     0     0     0     0     0     1     1     0      1
## sample_5     0     0     0     0     0     0     0     1     0      1
## sample_6     0     1     0     0     0     0     0     0     0      0
head(expr)
##            gene_1   gene_2   gene_3   gene_4   gene_5   gene_6   gene_7
## sample_1 6.091035 6.479307 6.702659 7.260538 8.196299 6.061722 6.548776
## sample_2 7.432563 7.016138 7.508599 7.295998 7.836607 7.825705 7.514788
## sample_3 6.917247 6.127383 6.097665 6.324987 6.017944 5.888059 6.912573
## sample_4 7.318114 6.791866 7.942633 6.894796 7.498120 7.199753 7.900366
## sample_5 7.288953 6.982799 8.120189 7.400018 7.401130 7.032545 7.413375
## sample_6 7.688428 8.420471 7.709514 8.422695 6.536603 8.340534 8.668130
##            gene_8    gene_9   gene_10
## sample_1 6.316635  9.646378  9.811172
## sample_2 8.359249 10.433948 10.455313
## sample_3 6.830853  8.312266  9.678442
## sample_4 8.651775  6.560307  9.509531
## sample_5 8.948665  7.482829  8.749063
## sample_6 7.582129  7.638559  8.011134
dim(gt)
## [1] 300  10
dim(expr)
## [1] 300  10
# Calculate MAF and draw MAF histogram
maf <- colMeans(gt)/2
maf
##       snp_1       snp_2       snp_3       snp_4       snp_5       snp_6 
## 0.006666667 0.030000000 0.020000000 0.065000000 0.046666667 0.126666667 
##       snp_7       snp_8       snp_9      snp_10 
## 0.171666667 0.298333333 0.390000000 0.511666667
maf <- pmin(maf, 1-maf)
maf
##       snp_1       snp_2       snp_3       snp_4       snp_5       snp_6 
## 0.006666667 0.030000000 0.020000000 0.065000000 0.046666667 0.126666667 
##       snp_7       snp_8       snp_9      snp_10 
## 0.171666667 0.298333333 0.390000000 0.488333333
sum(maf > 0.5)
## [1] 0
truehist(maf, main = "Histogram of MAF values.", col = "steelblue")
lines(density(maf), lty = 2, col = "darkorange", lwd = 3)

snps = c("snp_1", "snp_10", "snp_1")
genes = c("gene_1", "gene_10", "gene_10")
par(mfrow=c(1,length(snps)))

for (index in seq(length(snps))){
  genotype = gt[[snps[index]]]
  expression = expr[[genes[index]]]
  plot(jitter(genotype, factor = 0.4), expression,
  main=paste(snps[index], "vs", genes[index]), xlim= c(-0.5,2.5),
  xlab = "genotype", xaxt="n", col ="steelblue")
  axis(1, at=c(0,1,2), labels = c("0", "1", "2"))
}

#Reshape dataframes a bit for use with ggplot2
genoLong = tidyr::gather(gt, snp, genotype)
exprLong = tidyr::gather(expr, gene, expression)
dataLong = cbind(genoLong, exprLong["expression"])
dataLong$genotype = as.factor(dataLong$genotype)

ggplot(dataLong, aes(genotype, expression)) +
    geom_jitter(colour = "darkorange",alpha = 0.3, width = 0.2) +
    geom_boxplot(alpha=0.6, fill="steelblue",
                 position=position_dodge()) +
    facet_wrap(~snp) 

Let’s again try linear regression for gene 10 with snp 1 and snp 10

lm_1_10 = lm(expr[, "gene_10"]~ gt[, "snp_1"])
summary(lm_1_10)
## 
## Call:
## lm(formula = expr[, "gene_10"] ~ gt[, "snp_1"])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5262 -1.1412  0.0577  1.1552  4.0071 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    8.55324    0.09138  93.600   <2e-16 ***
## gt[, "snp_1"] -1.07768    0.79138  -1.362    0.174    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.572 on 298 degrees of freedom
## Multiple R-squared:  0.006185,   Adjusted R-squared:  0.00285 
## F-statistic: 1.854 on 1 and 298 DF,  p-value: 0.1743
lm_10_10 = lm(expr[, "gene_10"] ~ gt[,"snp_10"])
summary(lm_10_10)
## 
## Call:
## lm(formula = expr[, "gene_10"] ~ gt[, "snp_10"])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2863 -0.7161  0.0474  0.6658  2.4429 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     6.88488    0.10941   62.93   <2e-16 ***
## gt[, "snp_10"]  1.61627    0.08788   18.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.079 on 298 degrees of freedom
## Multiple R-squared:  0.5317, Adjusted R-squared:  0.5301 
## F-statistic: 338.3 on 1 and 298 DF,  p-value: < 2.2e-16

Let’s plot these regression lines over scatter plots from before:

snps = c("snp_1", "snp_10", "snp_1")
genes = c("gene_1", "gene_10", "gene_10")
par(mfrow=c(1,length(snps)))

for (index in seq(length(snps))){
  genotype = gt[[snps[index]]]
  expression = expr[[genes[index]]]
  lm_result = lm(expression ~ genotype)
  plot(jitter(genotype, factor = 0.4), expression,
  main=paste(snps[index], "vs", genes[index]), xlim= c(-0.5,2.5),
  xlab = "genotype", xaxt="n", col ="steelblue")
  abline(lm_result, col="darkorange")
  axis(1, at=c(0,1,2), labels = c("0", "1", "2"))
  # Add p-values as text
  y_range = range(expression)
  text(x=1, y=y_range[1] + 0.95*diff(y_range), paste0("p=",
  format(summary(lm_result)$coefficients[2,4],
  scentific=TRUE, digits=2)))
  }

A nicer way to plot these results via ggplot2:

genoLong = tidyr::gather(gt, snp, genotype, snp_1, snp_10)
exprLong = tidyr::gather(expr, gene, expression, gene_1, gene_10)
dataLong = cbind(genoLong[,c("snp", "genotype")], exprLong[,c("gene", "expression")])
dataLong$comparison = paste(dataLong$snp, "vs", dataLong$gene)
dataLong$genotype = factor(dataLong$genotype)

ggplot(dataLong, aes(genotype, expression)) +
    geom_jitter(col="darkorange", position=position_jitter(width=0.25)) +
    geom_boxplot(outlier.size=0, alpha=0.6, fill="steelblue") +
    geom_smooth(method = 'lm',col="darkred", aes(group=1), se=FALSE) +
    facet_wrap(~comparison) 

Why should we care about MAF

fit <- mapply(function(e, g) lm(e ~ g), expr, gt, SIMPLIFY=FALSE)
betaHat <- sapply(fit, coef)[2,]
betaHat
##    gene_1    gene_2    gene_3    gene_4    gene_5    gene_6    gene_7 
## 0.9416734 1.3518024 1.1324493 1.5005237 1.4329068 1.2928385 1.5106722 
##    gene_8    gene_9   gene_10 
## 1.6050215 1.6038507 1.6162730

We’ll use the R’s function confint to obtain 95% confidence intervals of the estimated SNP effects.

ci <- sapply(fit, confint, "g")
rownames(ci) <- c("lower", "upper")
ci
##           gene_1    gene_2    gene_3   gene_4   gene_5   gene_6   gene_7
## lower -0.1043169 0.8457378 0.5723167 1.156449 1.017273 1.026215 1.286452
## upper  1.9876638 1.8578670 1.6925819 1.844598 1.848541 1.559462 1.734892
##         gene_8   gene_9  gene_10
## lower 1.412641 1.426662 1.443334
## upper 1.797402 1.781040 1.789212

Now let’s plot confidence intervals for \(\hat{\beta_i}\) in respect to MAFs.

estimates = data.frame(Estimate=betaHat, t(ci), MAF=maf)
ggplot(estimates, aes(x=MAF)) + 
  geom_hline(yintercept=1.5, colour = "darkorange") +
  geom_hline(yintercept=0, colour = "darkred", linetype="longdash") +
  geom_errorbar(aes(ymin=lower, ymax=upper), colour = "steelblue") +
  geom_point(aes(y=Estimate), colour = "steelblue") 

In this example all resulting confidence intervals include the true value but intervals for small minor allele frequencies are large (and in one case this means that 0 is included in the CI). As one would expect the uncertainty in the estimate, as measured by the width of the confidence interval, decreases with increasing minor allele frequency. However, even at high MAF considerable uncertainty remains and point estimates are somewhat lacking in accuracy, overestimating the true effect.

Covariates

Many different factors can affect gene expression, such as age, sex, smoking habits, genetic mutations or environmental factors, such as nutrition, etc. The more factors can be described in the model, the more accurate it will be and the higher are chances to find more subtle genetic effects. Covariates therefore are features of samples which may describe effects on gene expression. In technical terms one covariate is therefore a vector of the same length as there are samples, e.g.: age. The examples before worked nicely, because it was simulated data without any covariates. Now we will be using data where covariates have been modelled additionally. Let’s see how linear regression models behave depending on inclusion of covaraiates. We’ll calculate the linear regression with and without the covariates for combination of gene 10 with snp 10.

expr_cov = read.table("/Users/igorhut/Documents/eQTL/jknightlab-eqtl-intro-e17c9f5/docker/data/simulated//sim_expression2.tab", sep="\t",
header=TRUE, row.names = 1)
covariates = read.table("/Users/igorhut/Documents/eQTL/jknightlab-eqtl-intro-e17c9f5/docker/data/simulated//sim_covariates.tab", sep="\t",
header=TRUE, row.names = 1)

head(expr_cov, 3)
##             gene_1    gene_2    gene_3    gene_4    gene_5   gene_6
## sample_1 11.950277 12.338549 12.561901 13.119780 14.055542 11.92096
## sample_2 19.742334 19.325909 19.818370 19.605769 20.146379 20.13548
## sample_3 -1.509752 -2.299616 -2.329334 -2.102012 -2.409055 -2.53894
##             gene_7    gene_8     gene_9   gene_10
## sample_1 12.408018 12.175877 15.5056204 15.670415
## sample_2 19.824559 20.669020 22.7437190 22.765084
## sample_3 -1.514426 -1.596146 -0.1147331  1.251443
head(covariates, 3)
##               var_1       var_2     var_3     var_4      var_5      var_6
## sample_1  0.1805229  0.65730433 2.1202020 0.7086180  0.2261268 -1.3233594
## sample_2  0.7847340  1.03631767 0.3571627 0.8541129 -0.3856621  1.0153862
## sample_3 -1.3531646 -0.04268473 0.4329161 0.1914436 -1.0722882  0.1656182
##               var_7      var_8      var_9    var_10      var_11    var_12
## sample_1  0.0693342 -1.2700925  1.9314654 0.1893478 -0.02579421 1.4651498
## sample_2  0.9291073  0.7901720  0.7504875 0.1566840  0.46778575 0.4504485
## sample_3 -0.9194574 -0.1477517 -2.5731055 0.2177240  0.26250697 0.4895526
##              var_13     var_14     var_15     var_16     var_17     var_18
## sample_1 -1.7309505 -0.8570105  0.3502420 -0.5500488  0.5100173 -1.7280022
## sample_2  0.2793073 -0.4621151  0.8739229 -0.9148169  1.0457920 -1.0324303
## sample_3  1.1093786  0.1780986 -0.7811387  0.4241464 -0.4223912  0.1432051
##              var_19      var_20
## sample_1  0.6182252 -0.09250967
## sample_2 -3.0233684 -0.16821972
## sample_3 -1.5812401  0.91562847
dim(expr_cov)
## [1] 300  10
dim(covariates)
## [1] 300  20
lm_10_10 = lm(expr_cov[,"gene_10"] ~ gt[,"snp_10"])
summary(lm_10_10)
## 
## Call:
## lm(formula = expr_cov[, "gene_10"] ~ gt[, "snp_10"])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.8886  -4.4990  -0.2714   5.1006  20.2046 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.8321     0.7492   7.784 1.16e-13 ***
## gt[, "snp_10"]   2.9265     0.6018   4.863 1.87e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.391 on 298 degrees of freedom
## Multiple R-squared:  0.07353,    Adjusted R-squared:  0.07042 
## F-statistic: 23.65 on 1 and 298 DF,  p-value: 1.873e-06
lm_10_10_covs = lm(expr_cov[,"gene_10"] ~ gt[,"snp_10"] + as.matrix(covariates))
summary(lm_10_10_covs)
## 
## Call:
## lm(formula = expr_cov[, "gene_10"] ~ gt[, "snp_10"] + as.matrix(covariates))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4494 -0.6804  0.0657  0.6722  2.5683 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  6.82775    0.11393  59.928  < 2e-16 ***
## gt[, "snp_10"]               1.65776    0.09172  18.074  < 2e-16 ***
## as.matrix(covariates)var_1   2.50006    0.06416  38.967  < 2e-16 ***
## as.matrix(covariates)var_2   2.24993    0.07045  31.935  < 2e-16 ***
## as.matrix(covariates)var_3   2.21608    0.06487  34.163  < 2e-16 ***
## as.matrix(covariates)var_4   2.20670    0.06667  33.097  < 2e-16 ***
## as.matrix(covariates)var_5   2.18179    0.06466  33.740  < 2e-16 ***
## as.matrix(covariates)var_6   1.90274    0.06336  30.031  < 2e-16 ***
## as.matrix(covariates)var_7   1.87298    0.06779  27.631  < 2e-16 ***
## as.matrix(covariates)var_8   1.67675    0.06664  25.163  < 2e-16 ***
## as.matrix(covariates)var_9   1.64621    0.06771  24.311  < 2e-16 ***
## as.matrix(covariates)var_10  1.55134    0.06076  25.531  < 2e-16 ***
## as.matrix(covariates)var_11  1.48276    0.06598  22.475  < 2e-16 ***
## as.matrix(covariates)var_12  1.31934    0.06802  19.396  < 2e-16 ***
## as.matrix(covariates)var_13  1.23396    0.06564  18.799  < 2e-16 ***
## as.matrix(covariates)var_14  1.14690    0.06383  17.967  < 2e-16 ***
## as.matrix(covariates)var_15  1.13261    0.06344  17.854  < 2e-16 ***
## as.matrix(covariates)var_16  0.96792    0.06647  14.562  < 2e-16 ***
## as.matrix(covariates)var_17  0.75899    0.06875  11.039  < 2e-16 ***
## as.matrix(covariates)var_18  0.70516    0.06483  10.876  < 2e-16 ***
## as.matrix(covariates)var_19  0.64040    0.06584   9.727  < 2e-16 ***
## as.matrix(covariates)var_20  0.50829    0.06822   7.450 1.18e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.084 on 278 degrees of freedom
## Multiple R-squared:  0.9814, Adjusted R-squared:   0.98 
## F-statistic: 699.3 on 21 and 278 DF,  p-value: < 2.2e-16

Let’s plot the results:

par(mfrow=c(1,2))

plot(jitter(gt[,"snp_10"], factor = 0.4), expr_cov[,"gene_10"],
  main="gene_10 vs snp_10", xlim= c(-0.5,2.5),
  xlab = "genotype", xaxt="n", col ="steelblue")
  abline(lm_10_10, col="darkorange", lwd = 2, lty = 2)
  axis(1, at=c(0,1,2), labels = c("0", "1", "2"))
  # Add p-values as text
  y_range = range(expr_cov[,"gene_10"])
  text(x=1, y=y_range[1] + 0.95*diff(y_range), paste0("p=",
  format(summary(lm_10_10)$coefficients[2,4],
  scentific=TRUE, digits=2)), col = "darkorange")
  
plot(jitter(gt[,"snp_10"], factor = 0.4), expr_cov[,"gene_10"],
  main="gene_10 vs snp_10 incl. cov.", xlim= c(-0.5,2.5),
  xlab = "genotype", xaxt="n", col ="steelblue")
  abline(lm_10_10_covs, col="darkorange", lwd = 2, lty = 2)
## Warning in abline(lm_10_10_covs, col = "darkorange", lwd = 2, lty = 2):
## only using the first two of 22 regression coefficients
  axis(1, at=c(0,1,2), labels = c("0", "1", "2"))
  # Add p-values as text
  y_range = range(expr_cov[,"gene_10"])
  text(x=1, y=y_range[1] + 0.95*diff(y_range), paste0("p=",
  format(summary(lm_10_10_covs)$coefficients[2,4],
  scentific=TRUE, digits=2)), col = "darkorange")

Using principle components as covariates

Further we will explore the use of principle components as covariates in linear models of gene expression to account for unknown sources of variation. Let’s use the data set from the previous exercise and include PC’s, as covariates, in the linear model instead of known covariates we used previously.

Check the data:

head(expr_cov, 3)
##             gene_1    gene_2    gene_3    gene_4    gene_5   gene_6
## sample_1 11.950277 12.338549 12.561901 13.119780 14.055542 11.92096
## sample_2 19.742334 19.325909 19.818370 19.605769 20.146379 20.13548
## sample_3 -1.509752 -2.299616 -2.329334 -2.102012 -2.409055 -2.53894
##             gene_7    gene_8     gene_9   gene_10
## sample_1 12.408018 12.175877 15.5056204 15.670415
## sample_2 19.824559 20.669020 22.7437190 22.765084
## sample_3 -1.514426 -1.596146 -0.1147331  1.251443
dim(expr_cov)
## [1] 300  10

R provides the function prcomp for computing PCs. Like most standard R functions it expects data to be laid out with variables in columns and samples in rows, so we are good to go.

pca <- prcomp(expr_cov, center=TRUE, scale = TRUE)
head(pca$x, 3)
##                PC1         PC2       PC3         PC4        PC5
## sample_1 -2.276793 -0.06354526 0.3304429  0.11776561 0.07185696
## sample_2 -5.358711 -0.11632936 0.2518289 -0.02015748 0.06394395
## sample_3  3.919988  0.06913189 0.2497241 -0.06144515 0.10165365
##                   PC6          PC7         PC8         PC9        PC10
## sample_1  0.202507384 -0.038418155 -0.10083580 -0.08100452  0.06052746
## sample_2 -0.004416506 -0.001257329 -0.03820300 -0.08312656 -0.02748060
## sample_3  0.058176697 -0.001891389  0.09413981 -0.03593804 -0.09582196

As you can see, since we have 10 variables, we got 10 PCs in total. Let’s plot percentage of explained variance for all PCs.

library(factoextra)
plot(pca)

fviz_eig(pca, addlabels = TRUE)

The variance accounted for by each component is available through the sdev field of the prcomp return value:

pca$sdev
##  [1] 3.14394553 0.15615397 0.14470210 0.13538318 0.12397628 0.10219570
##  [7] 0.08938052 0.08844315 0.07362859 0.07006292
summary(pca)
## Importance of components:
##                           PC1     PC2     PC3     PC4     PC5     PC6
## Standard deviation     3.1439 0.15615 0.14470 0.13538 0.12398 0.10220
## Proportion of Variance 0.9884 0.00244 0.00209 0.00183 0.00154 0.00104
## Cumulative Proportion  0.9884 0.99088 0.99297 0.99480 0.99634 0.99739
##                            PC7     PC8     PC9    PC10
## Standard deviation     0.08938 0.08844 0.07363 0.07006
## Proportion of Variance 0.00080 0.00078 0.00054 0.00049
## Cumulative Proportion  0.99818 0.99897 0.99951 1.00000
# or
sum(pca$sdev[1:5]^2)/sum(pca$sdev^2) # percentage of variace explained by first 5 PCs
## [1] 0.9963415
pca$sdev[1]^2/sum(pca$sdev^2) # percentage of variance explained by the 1st PC
## [1] 0.9884394

Since this is specifically modeled data, the first PC accounts for major part of variance. Let’s do modeling and see how the model behaves depending on inclusion of PCs.

# no PCs included
lm_9_9 = lm(expr_cov[,"gene_9"] ~ gt[,"snp_9"])
summary(lm_9_9)
## 
## Call:
## lm(formula = expr_cov[, "gene_9"] ~ gt[, "snp_9"])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.9702  -4.2002   0.0046   4.9352  20.0212 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     7.3809     0.6442  11.457   <2e-16 ***
## gt[, "snp_9"]   1.3727     0.6142   2.235   0.0262 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.459 on 298 degrees of freedom
## Multiple R-squared:  0.01648,    Adjusted R-squared:  0.01318 
## F-statistic: 4.994 on 1 and 298 DF,  p-value: 0.02617
# let's now include the 1st PC
lm_9_9_PC1 = lm(expr_cov[,"gene_9"] ~ gt[,"snp_9"] + as.numeric(pca$x[,1]))
summary(lm_9_9_PC1)
## 
## Call:
## lm(formula = expr_cov[, "gene_9"] ~ gt[, "snp_9"] + as.numeric(pca$x[, 
##     1]))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.36081 -0.34148 -0.02407  0.33541  1.39800 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             7.410309   0.043561  170.11   <2e-16 ***
## gt[, "snp_9"]           1.334989   0.041534   32.14   <2e-16 ***
## as.numeric(pca$x[, 1]) -2.363213   0.009278 -254.71   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5044 on 297 degrees of freedom
## Multiple R-squared:  0.9955, Adjusted R-squared:  0.9955 
## F-statistic: 3.298e+04 on 2 and 297 DF,  p-value: < 2.2e-16

Let’s again plot the results:

par(mfrow=c(1,2))

plot(jitter(gt[,"snp_9"], factor = 0.4), expr_cov[,"gene_9"],
  main="gene_9 vs snp_9", xlim= c(-0.5,2.5),
  xlab = "genotype", xaxt="n", col ="steelblue")
  abline(lm_9_9, col="darkorange", lwd = 2, lty = 2)
  axis(1, at=c(0,1,2), labels = c("0", "1", "2"))
  # Add p-values as text
  y_range = range(expr_cov[,"gene_9"])
  text(x=1, y=y_range[1] + 0.95*diff(y_range), paste0("p=",
  format(summary(lm_9_9)$coefficients[2,4],
  scentific=TRUE, digits=2)), col = "darkorange")
  
plot(jitter(gt[,"snp_9"], factor = 0.4), expr_cov[,"gene_9"],
  main="gene_9 vs snp_9 incl. PC1", xlim= c(-0.5,2.5),
  xlab = "genotype", xaxt="n", col ="steelblue")
  abline(lm_9_9_PC1, col="darkorange", lwd = 2, lty = 2)
## Warning in abline(lm_9_9_PC1, col = "darkorange", lwd = 2, lty = 2): only
## using the first two of 3 regression coefficients
  axis(1, at=c(0,1,2), labels = c("0", "1", "2"))
  # Add p-values as text
  y_range = range(expr_cov[,"gene_9"])
  text(x=1, y=y_range[1] + 0.95*diff(y_range), paste0("p=",
  format(summary(lm_9_9_PC1)$coefficients[2,4],
  scentific=TRUE, digits=2)), col = "darkorange")

Large scale eQTL analysis

Now that we’ve covered the basics let’s focus on real world expression and genotyping data. We’ll be using R’s MatrixEQTL package which is designed for fast eQTL analysis on large genomic data sets. MatrixEQTL can test for association between genotype and gene expression using linear regression with either additive or ANOVA genotype effects. The models can include covariates to account for factors as population stratification, gender, and clinical variables. It also supports models with heteroscedastic and/or correlated errors, false discovery rate estimation and separate treatment of local (cis) and distant (trans) eQTLs. You can learn more about MatrixEQTL here.

Importing and preprocessing genotype and expression data

Genotype and expression data come in all sorts of flavours and preprocessing of this data needs to be done rather carefully. In this short introduction it is not possible to cover this topic in detail. Common input formats for genotypes are VCF, PLINK files, or even other custom files which give the genotype of each sample on all queried genomic positions. Expression data can also be made available in various formats depending on the underlying technology (RNAseq or expression micro array). Good portion of publicly available eQTL data sets are, still, from microarray experiments.

cis-eQTL and trans-eQTL analysis

To perform an eQTL analysis we don’t only need to know the genotype and gene expression values for every sample, but also the genomic positions of genes and SNPs. This is necessary to define which SNPs should be tested against which genes. For cis-eQTL analyses SNPs in proximity to the gene are chosen and for trans-eQTL analyses SNPs further away, or on different chromosomes, are taken into account. The window in cis-eQTL analysis is commonly chosen to be 500kb-1Mb measured from gene’s TSS.

Example

Now let’s go through an example of cis- trans-eQTL mapping with chr22_GEUVADIS_358_continental data.

# eQTL mapping, cis/trans, no pcs
  suppressMessages(library(MatrixEQTL))

  
  SNP_file_name <-  "/Users/igorhut/Documents/eQTL/Data/Voja_QTL_tools/SNP_voja.txt";
  snps_location_file_name <- "/Users/igorhut/Documents/eQTL/Data/Voja_QTL_tools/snpsloc_voja.txt";
  expression_file_name <- "/Users/igorhut/Documents/eQTL/Data/Voja_QTL_tools/GE_voja.txt";
  gene_location_file_name <- "/Users/igorhut/Documents/eQTL/Data/Voja_QTL_tools/geneloc_voja.txt";
  covariates_file_name <- "/Users/igorhut/Documents/eQTL/Data/Voja_QTL_tools/cov_voja.txt";
  cis_threshold <- 1e-5
  trans_threshold <- 1e-5
  cis_dist <- 1e6
  
  
  # Output file name
  output_file_name_cis = tempfile();
  output_file_name_tra = tempfile();
  # output_file_name_cis = "/Users/igorhut/Documents/eQTL/tools_comparison_R_proj/output_voja_cis_eqtls_1.txt";
  # output_file_name_tra = "/Users/igorhut/Documents/eQTL/tools_comparison_R_proj/output_voja_trans_eqtls_1.txt";
  
  ## Settings
  # Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS
  useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS

  # Only associations significant at this level will be saved
  pvOutputThreshold_cis = cis_threshold;
  pvOutputThreshold_tra = trans_threshold;
  
  # Set to character() for no covariates
  # covariates_file_name = character();
  
  # Error covariance matrix
  # Set to numeric() for identity.
  errorCovariance = numeric();
  
  # Distance for local gene-SNP pairs
  cisDist = cis_dist
  
  ## Load genotype data
  snps = SlicedData$new();
  snps$fileDelimiter = "\t"; # the TAB character
  snps$fileOmitCharacters = "NA"; # denote missing values;
  snps$fileSkipRows = 1; # one row of column labels
  snps$fileSkipColumns = 1; # one column of row labels
  snps$fileSliceSize = 2000; # read file in slices of 2,000 rows
  snps$LoadFile(SNP_file_name);
  
  ## Load gene expression data
  gene = SlicedData$new();
  gene$fileDelimiter = "\t"; # the TAB character
  gene$fileOmitCharacters = "NA"; # denote missing values;
  gene$fileSkipRows = 1; # one row of column labels
  gene$fileSkipColumns = 1; # one column of row labels
  gene$fileSliceSize = 2000; # read file in slices of 2,000 rows
  gene$LoadFile(expression_file_name);
  
  ## Load covariates
  cvrt = SlicedData$new();
  cvrt$fileDelimiter = "\t"; # the TAB character
  cvrt$fileOmitCharacters = "NA"; # denote missing values;
  cvrt$fileSkipRows = 1; # one row of column labels
  cvrt$fileSkipColumns = 1; # one column of row labels
  if(length(covariates_file_name)>0) {
    cvrt$LoadFile(covariates_file_name);
  }
  
  ## Run the analysis
  snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE);
  genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE);
  
me = Matrix_eQTL_main(
  snps = snps,
  gene = gene,
  cvrt = cvrt,
  output_file_name = output_file_name_tra,
  pvOutputThreshold = pvOutputThreshold_tra,
  useModel = useModel,
  errorCovariance = errorCovariance,
  verbose = FALSE,
  output_file_name.cis = output_file_name_cis,
  pvOutputThreshold.cis = pvOutputThreshold_cis,
  snpspos = snpspos,
  genepos = genepos,
  cisDist = cisDist,
  pvalue.hist = TRUE,
  min.pv.by.genesnp = FALSE,
  noFDRsaveMemory = FALSE);
  
  me_qq = Matrix_eQTL_main(
    snps = snps,
    gene = gene,
    cvrt = cvrt,
    output_file_name = output_file_name_tra,
    pvOutputThreshold = pvOutputThreshold_tra,
    useModel = useModel,
    errorCovariance = errorCovariance,
    verbose = FALSE,
    output_file_name.cis = output_file_name_cis,
    pvOutputThreshold.cis = pvOutputThreshold_cis,
    snpspos = snpspos,
    genepos = genepos,
    cisDist = cisDist,
    pvalue.hist = "qqplot",
    min.pv.by.genesnp = FALSE,
    noFDRsaveMemory = FALSE)
  
  unlink(output_file_name_tra);
  unlink(output_file_name_cis);
  
  cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n');
## Analysis done in:  21.943  seconds
  head(me$cis$eqtls)
##             snps              gene statistic       pvalue          FDR
## 1 chr22_41256802 ENSG00000172404.4  31.85769 9.838311e-99 2.697776e-92
## 2 chr22_41272143 ENSG00000172404.4  30.00462 9.694354e-93 1.329151e-86
## 3 chr22_41317083 ENSG00000172404.4 -29.79930 4.593491e-92 3.148968e-86
## 4 chr22_41339367 ENSG00000172404.4 -29.79930 4.593491e-92 3.148968e-86
## 5 chr22_41217786 ENSG00000172404.4  29.73128 7.699892e-92 4.222795e-86
## 6 chr22_41353455 ENSG00000172404.4 -29.58144 2.407304e-91 1.100183e-85
##         beta
## 1  0.3185012
## 2  0.3081578
## 3 -0.3086846
## 4 -0.3086846
## 5  0.3079505
## 6 -0.3074922
  head(me$trans$eqtls)
##             snps               gene  statistic       pvalue          FDR
## 1 chr22_21722525  ENSG00000215513.7 -13.268920 5.428888e-32 2.672792e-24
## 2 chr22_18727810  ENSG00000215513.7   9.899769 3.340484e-20 8.223064e-13
## 3 chr22_21722525  ENSG00000206176.5  -9.647465 2.219881e-19 3.643029e-12
## 4 chr22_20400371 ENSG00000183506.12   8.409424 1.638824e-15 1.833918e-08
## 5 chr22_18727198  ENSG00000215513.7   8.364461 2.234997e-15 1.833918e-08
## 6 chr22_18727222  ENSG00000215513.7   8.364461 2.234997e-15 1.833918e-08
##          beta
## 1 -0.56291102
## 2  0.32524771
## 3 -0.22652690
## 4  0.05278156
## 5  0.27834246
## 6  0.27834246
  ## Make the histogram of local and distant p-values
  plot(me)

  ## Make the qq-plot of local and distant p-values
  plot(me_qq)  

Multiple testing correction

Whenever multiple statistical tests are performed, a multiple testing correction has to be performed. This is necessary because many hypotheses are tested. Therefore each calculated association p-value has to be corrected for multiple testing. MatrixEQTL does this for you automatically and returns the corrected p-value as a false discovery rate (FDR). Common thresholds on FDR are 5% or 10%.

Interpreting eQTL results

Linkage disequilibrium (LD)

Linkage disequilibrium (LD) is a very important effect that plays a big role in genetic association studies. It describes the fact that genetic variants are not always inherited independently due to recombination patterns during reproduction. SNPs in LD are inherited in similar patterns and therefore can explain gene expression in similar ways. This means that LD makes it harder for association studies to identify one single SNP being associated with altered gene expression. Also it is possible that the combination of SNPs (as a haplotype) causes differences in gene expression and not only one single SNP. Watch this video which explains the basics of LD.

Selecting eQTLs

Commonly one selects at most one associated SNP per gene. If there are many SNPs associated with a gene it is most likely that those SNPs are highly linked to each other (“in high LD”) and therefore they describe the same effect. There are still cases in which genes are regulated by different SNPs independently, this cannot be readily determined from the table produced by MatrixEQTL. Here we will not try to identify the independent lead eQTL signals.

Example

Let’s indentify which SNPs are (significantly) associated with which genes at a maximum FDR of 5% from the cis-eQTL results. We’ll print a table in which only the lead SNP per gene is given. Also we’ll add the MAF for every SNP in the table.

library(dplyr)

snp_values = read.table(SNP_file_name, row.names=1, header=TRUE)
snp_values = data.frame(snps = rownames(snp_values), snp_values, stringsAsFactors = FALSE)

top_eqtls = filter(me$cis$eqtls, FDR <= 0.05) %>% 
  arrange(FDR) %>% 
  distinct(gene, .keep_all = TRUE)

mafs = apply(as.matrix(snp_values[-1]),1,mean)/2
mafs = pmin(mafs, 1 - mafs)
mafs = data.frame(snps=names(mafs), maf = mafs)
top_eqtls = left_join(top_eqtls, mafs, by="snps")
head(top_eqtls)
##             snps              gene statistic       pvalue          FDR
## 1 chr22_41256802 ENSG00000172404.4  31.85769 9.838311e-99 2.697776e-92
## 2 chr22_45726345 ENSG00000100376.7 -28.29562 4.787518e-87 1.875417e-81
## 3 chr22_42465260 ENSG00000183172.8  27.96028 6.544805e-86 9.445576e-81
## 4 chr22_26921725 ENSG00000260065.1  27.92949 8.327123e-86 9.788951e-81
## 5 chr22_24334948 ENSG00000184674.8 -27.29725 1.200698e-83 4.988560e-79
## 6 chr22_51001271 ENSG00000217442.3  25.63617 7.118432e-78 1.384365e-73
##         beta        maf
## 1  0.3185012 0.43435754
## 2 -3.5781293 0.13268156
## 3  4.0907715 0.31703911
## 4  0.4825401 0.47206704
## 5 -2.5667483 0.36312849
## 6  0.9441055 0.06284916

Whenever multiple statistical tests are performed, a multiple testing correction needs to be performed. This is necessary in order to compensate for the errors which arise from multiple hypothesis testing. Therefore each calculated association p-value has to be corrected. MatrixEQTL does this for you automatically and returns the corrected p-value as a false discovery rate (FDR). Common thresholds on FDR are 5% or 10%.

Presenting eQTL analysis results

There are a few standard plots which are common in eQTL analyses. We already produced one of them earlier when we plotted gene expression versus genotype. This gives a visual insight in how clear the data was and what the linear regression actually detected.

Gene expression vs. genotype for the eqtl with the lowest association p-value

Let’s now plot gene expression vs. genotype for the eqtl with the lowest association FDR-value. Also we will add linear regression line for this isolated case.

# For this we also need df with expression data
gene_values = read.table(expression_file_name, row.names=1, header=TRUE)
gene_values = data.frame(gene = rownames(gene_values), gene_values, stringsAsFactors = FALSE)

top_snp = top_eqtls$snps[1]
top_gene = as.character(top_eqtls$gene[1])

top_snp_data = filter(snp_values, snps == top_snp)
top_gene_data = filter(gene_values, gene == top_gene)

plot_data = t(bind_rows(top_snp_data[-1], top_gene_data[-1]))
colnames(plot_data) = c("snp", "gene_expr")
plot_data = as.data.frame(plot_data)
plot_data$snp = as.factor(plot_data$snp)
head(plot_data)
##         snp gene_expr
## HG00096   0 0.2518932
## HG00097   1 0.5218350
## HG00099   2 0.8620147
## HG00100   1 0.5844244
## HG00101   1 0.5139019
## HG00102   2 0.9935348
lm_top = lm(plot_data[,"gene_expr"] ~ as.numeric(plot_data[,"snp"]))
summary(lm_top)
## 
## Call:
## lm(formula = plot_data[, "gene_expr"] ~ as.numeric(plot_data[, 
##     "snp"]))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.46029 -0.09368 -0.00761  0.08633  0.49462 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -0.10229    0.02460  -4.158 4.02e-05 ***
## as.numeric(plot_data[, "snp"])  0.30662    0.01096  27.974  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1458 on 356 degrees of freedom
## Multiple R-squared:  0.6873, Adjusted R-squared:  0.6864 
## F-statistic: 782.6 on 1 and 356 DF,  p-value: < 2.2e-16
plot(plot_data, col="steelblue", 
     main = paste0(top_gene, " vs ", top_snp))
abline(lm_top, col="darkorange", lwd = 2, lty = 2)
y_range = range(plot_data[,"gene_expr"])
  text(x=2, y=y_range[1] + 0.5*diff(y_range), paste0("p=",
  format(summary(lm_top)$coefficients[2,4],
  scentific=TRUE, digits=2)), col = "darkorange")

Manhanttan plots

Manhattan plots are a way to depict association p-values of multiple SNPs at once. They are also very common in GWAS. Manhattan plots are a rather convinient modality for interpretation of eQTL signals in terms of LD.

Let’s now generate a manhattan plot for gene ENSG00000172404.4, plotting the base-pair position on the x-axis and the \(-log_{10}(pvalue)\) of the SNP in the y axis. Manhattan plots usually depict all tested SNPs, not only the ones passing a certain p-value threshold. Therefore we’ll first obtain all the association p-values for all tested SNPs for gene ENSG00000172404.4.

gene_id = "ENSG00000172404.4"

gene_values = read.table(expression_file_name, row.names=1, header=TRUE)
single_gene_exp = SlicedData$new()
single_gene_exp$CreateFromMatrix(as.matrix(gene_values[gene_id, , drop=FALSE]))
single_gene_exp
## SlicedData object. For more information type: ?SlicedData
## Number of columns: 358 
## Number of rows: 1 
## Data is stored in 1 slices
## Top left corner of the first slice (up to 10x10):
##                     HG00096  HG00097   HG00099   HG00100   HG00101
## ENSG00000172404.4 0.2518932 0.521835 0.8620147 0.5844244 0.5139019
##                     HG00102   HG00103  HG00105   HG00106   HG00108
## ENSG00000172404.4 0.9935348 0.7540972 1.102222 0.5453733 0.1863607
snpspos = read.table(snps_location_file_name, 
                     header = TRUE, 
                     stringsAsFactors = FALSE)

genepos = read.table(gene_location_file_name, 
                     header = TRUE, 
                     stringsAsFactors = FALSE)

single_cis_eqtl_res = Matrix_eQTL_main(snps, 
                                       single_gene_exp,
                                       verbose = FALSE,
                                       output_file_name.cis = NULL,
                                       output_file_name = NULL,
                                       pvOutputThreshold.cis=1, 
                                       snpspos = snpspos,
                                       genepos = genepos)

manh_data = merge(single_cis_eqtl_res$cis$eqtls, snpspos, by.x = "snps", by.y = "snipid")
manh_data = manh_data [,c("pos", "chr", "pvalue", "snps")]
head(manh_data)
##        pos   chr    pvalue           snps
## 1 40259861 chr22 0.9238947 chr22_40259861
## 2 40262482 chr22 0.7047404 chr22_40262482
## 3 40263359 chr22 0.6800275 chr22_40263359
## 4 40263997 chr22 0.7054801 chr22_40263997
## 5 40264551 chr22 0.7047404 chr22_40264551
## 6 40264584 chr22 0.9238947 chr22_40264584
# Plot the Manhattanplot
with(manh_data ,plot(pos, -log10(pvalue), xlab = "genomic position (bp)",
main=paste(gene_id, "associated SNPs")))
# Highlight the lead SNP
with(manh_data[which.min(manh_data$pvalue),,drop=FALSE] ,
points(pos, -log10(pvalue), pch=20, col="red"))
# Add a label to the lead SNP
with(manh_data[which.min(manh_data$pvalue),,drop=FALSE],
text(pos + diff(range(manh_data$pos))*0.2, -log10(pvalue), labels = snps))

Now produce manhattan plots for the top 3 cis-eQTL results and for the bottom 3 cis-eQTL in the top_eqtls matrix we have created earlier.

for (gene_id in top_eqtls$gene[c(1:3,(nrow(top_eqtls)-3):nrow(top_eqtls))]){
  print(gene_id)
  single_gene_exp = SlicedData$new()
  single_gene_exp$CreateFromMatrix(as.matrix(gene_values[gene_id,, drop=FALSE]))
  
  single_cis_eqtl_res = Matrix_eQTL_main(snps, single_gene_exp,
      output_file_name.cis = NULL,
      output_file_name = NULL,
      pvOutputThreshold.cis=1,
      verbose = FALSE,
      snpspos=snpspos,
      genepos=genepos)
  
  manh_data = merge(single_cis_eqtl_res$cis$eqtl, 
                    snpspos, 
                    by.x="snps", 
                    by.y = "snipid")
  manh_data =manh_data[,c("pos", "chr", "pvalue", "snps")]
  par(mfrow=c(1,1))
  # Plot the Manhattanplot
  with(manh_data ,plot(pos, -log10(pvalue), xlab = "genomic position (bp)",
  main=paste(gene_id, "associated SNPs")))
  # Highlight the lead SNP
  with(manh_data[which.min(manh_data$pvalue),,drop=FALSE] ,
  points(pos, -log10(pvalue), pch=20, col="red"))
  # Add a label to the lead SNP
  with(manh_data[which.min(manh_data$pvalue),,drop=FALSE],
  text(pos + diff(range(manh_data$pos))*0.2, -log10(pvalue), labels = snps))
  scan(stdin())
}
## [1] "ENSG00000172404.4"

## [1] "ENSG00000100376.7"

## [1] "ENSG00000183172.8"

## [1] "ENSG00000273203.1"

## [1] "ENSG00000236850.4"

## [1] "ENSG00000182541.13"

## [1] "ENSG00000183785.10"

Mostly there is a very clear eQTL signal visible, i.e. a distinct peak. Variants which are similar in “height” as the lead cis-eQTL SNP, but lower are most likely SNPs in LD with the lead SNP. In some cases horizontal lines (chunks) become visible which means that those variants are in very high LD among each other, i.e. they are usually inherited together. When variants are in very high LD (horizontal lines) their importance for gene expression cannot be distinguished. Other methods such as fine mapping try to use information like genome segmentation to break the LD blocks into smaller fractions of being more or less likely causal.

eQTL SNP distance from the TSS

Usually cis-eQTL SNPs are located around the transcription starting site (TSS) of the associated gene. Depending on the dataset there may be a slight bias to more associations upstream the TSS. When looking at the SNP positions relative to the TSS one has to take the strand of the gene into account, as up- and downstream are always relative to the strand the gene lies on.

Distance distribution of cis−eQTLs.

Distance distribution of cis−eQTLs.

Other (e)QTL calling algorithms