Genomic data set - chr22_GEUVADIS_358_continental
library(readr)
library(tidyr)
library(dplyr)
snps = read_tsv("/Users/igorhut/Documents/eQTL/Data/Voja_QTL_tools/SNP_voja.txt")
expr = read_tsv("/Users/igorhut/Documents/eQTL/Data/Voja_QTL_tools/GE_voja.txt")
To get a basic idea of how many samples, genes and which genotyped positions are available in the data set, we’ll print out the “head” and dimensions of the loaded data sets. For the first 10 genes we’ll print out the mean expression levels + standard deviation. The same goes for the first 10 SNPs in the snps data frame.
head(expr)[1:12]
## # A tibble: 6 x 12
## geneid HG00096 HG00097 HG00099 HG00100 HG00101 HG00102 HG00103 HG00105
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ENSG000… 0.788 1.60 1.28 1.87 1.86 0.503 1.78 1.86
## 2 ENSG000… 0.461 1.30 0.0309 0.933 1.44 0.121 1.84 0.358
## 3 ENSG000… 0.331 0.641 0.0236 0.115 0.464 0.211 0.844 0.164
## 4 ENSG000… 0.156 0.297 0 0.0866 0.277 0.0996 0.823 0
## 5 ENSG000… 3.06 2.76 3.17 2.23 3.43 2.15 2.88 3.14
## 6 ENSG000… 0.0466 0.0221 0.0399 0.0258 0.0550 0.0149 0.0307 0.0461
## # ... with 3 more variables: HG00106 <dbl>, HG00108 <dbl>, HG00109 <dbl>
dim(expr)
## [1] 608 359
head(snps)[1:12]
## # A tibble: 6 x 12
## snipid HG00096 HG00097 HG00099 HG00100 HG00101 HG00102 HG00103 HG00105
## <chr> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 chr22_1… 0 1 1 0 0 1 0 0
## 2 chr22_1… 1 0 0 0 0 0 1 0
## 3 chr22_1… 0 1 1 0 0 1 0 0
## 4 chr22_1… 0 0 0 0 0 0 0 0
## 5 chr22_1… 0 0 0 0 0 0 0 0
## 6 chr22_1… 1 1 1 2 2 1 1 1
## # ... with 3 more variables: HG00106 <int>, HG00108 <int>, HG00109 <int>
dim(snps)
## [1] 85485 359
paste("Mean value of expression for gene ",expr$geneid[1:10]," is ", rowMeans(expr[1:10, -1]))
## [1] "Mean value of expression for gene ENSG00000215270.3 is 1.37195382262709"
## [2] "Mean value of expression for gene ENSG00000237438.2 is 1.47948167480269"
## [3] "Mean value of expression for gene ENSG00000273203.1 is 0.658849616400615"
## [4] "Mean value of expression for gene ENSG00000273442.1 is 0.30150779514162"
## [5] "Mean value of expression for gene ENSG00000177663.9 is 2.76893802934637"
## [6] "Mean value of expression for gene ENSG00000183307.3 is 0.0330109958248743"
## [7] "Mean value of expression for gene ENSG00000069998.8 is 12.5103110517179"
## [8] "Mean value of expression for gene ENSG00000093072.11 is 9.5873076093771"
## [9] "Mean value of expression for gene ENSG00000099954.14 is 0.0257808687212151"
## [10] "Mean value of expression for gene ENSG00000182902.9 is 0.0559065276387877"
paste("Standard deviation of expression for gene ", expr$geneid[1:10]," is ", t(apply(expr[1:10, -1], 1, sd)))
## [1] "Standard deviation of expression for gene ENSG00000215270.3 is 0.472651252805272"
## [2] "Standard deviation of expression for gene ENSG00000237438.2 is 1.20578650765609"
## [3] "Standard deviation of expression for gene ENSG00000273203.1 is 0.516317432602619"
## [4] "Standard deviation of expression for gene ENSG00000273442.1 is 0.30999282888107"
## [5] "Standard deviation of expression for gene ENSG00000177663.9 is 0.663435616569955"
## [6] "Standard deviation of expression for gene ENSG00000183307.3 is 0.022646815889866"
## [7] "Standard deviation of expression for gene ENSG00000069998.8 is 2.92736948283107"
## [8] "Standard deviation of expression for gene ENSG00000093072.11 is 5.07079616962829"
## [9] "Standard deviation of expression for gene ENSG00000099954.14 is 0.0384475820388767"
## [10] "Standard deviation of expression for gene ENSG00000182902.9 is 0.0309177015985779"
paste("Mean value for SNP ",snps$snipid[1:10]," is ", rowMeans(snps[1:10, -1]))
## [1] "Mean value for SNP chr22_16051249 is 0.187150837988827"
## [2] "Mean value for SNP chr22_16052080 is 0.251396648044693"
## [3] "Mean value for SNP chr22_16052962 is 0.153631284916201"
## [4] "Mean value for SNP chr22_16052986 is 0.108938547486034"
## [5] "Mean value for SNP chr22_16053444 is 0.108938547486034"
## [6] "Mean value for SNP chr22_16053659 is 1.4413407821229"
## [7] "Mean value for SNP chr22_16053791 is 0.53072625698324"
## [8] "Mean value for SNP chr22_16053862 is 0.187150837988827"
## [9] "Mean value for SNP chr22_16053863 is 0.251396648044693"
## [10] "Mean value for SNP chr22_16054454 is 0.187150837988827"
paste("Standard deviation for SNP ", snps$snipid[1:10]," is ", t(apply(snps[1:10, -1], 1, sd)))
## [1] "Standard deviation for SNP chr22_16051249 is 0.418282783518628"
## [2] "Standard deviation for SNP chr22_16052080 is 0.434423220035923"
## [3] "Standard deviation for SNP chr22_16052962 is 0.376294327742715"
## [4] "Standard deviation for SNP chr22_16052986 is 0.320850570593205"
## [5] "Standard deviation for SNP chr22_16053444 is 0.320850570593205"
## [6] "Standard deviation for SNP chr22_16053659 is 0.635698421409098"
## [7] "Standard deviation for SNP chr22_16053791 is 0.619844588756408"
## [8] "Standard deviation for SNP chr22_16053862 is 0.418282783518628"
## [9] "Standard deviation for SNP chr22_16053863 is 0.434423220035923"
## [10] "Standard deviation for SNP chr22_16054454 is 0.418282783518628"
Minor allele frequency (MAF) is a measure of the presence of an allele in a population. More precisely MAF refers to the frequency at which the second most common allele occurs in a given population. Every individual person has about 4.1-5 million bases in which he differs from the reference genome (ref). Some of those variants are common in a certain population, others not. In order to measure the “rareness”" of a specific variant (allele) MAF can be calculated. In case of single nucleotide polymorphisms there can be up to four different alleles in one position (A, C, G, T). Those alleles can be homozygous or heterozygous, when the maternal allele was different from the paternal allele at that position. SNPs in a population are always defined by the genomic position and by two alleles: The allele defined in the reference genome and one allele present in some individuals, but different from the reference sequence.
An example:
Spotting the differences.
This individual has two SNPs, he is heterozygous in one of them and homozygous in the other. In eQTL analyses a SNP is always defined as a single allele being different from the reference. If in a population there are multiple different alleles for one position, they would be treated as independent entities of SNPs. Now we know that in eQTL analyses SNPs can only have two alleles: The reference and the alternative. Calculating MAF is essentially counting the presence of the alleles in a population and representing it as a percentage. Each individual can have 0, 1 or 2 times the alternative allele.
The term MAF implies that the allele for which we return the measure has to be the minor (= less common) allele. This means that the MAF is smaller than 0.5 by definition.
So let’s now calculate the MAF for all SNPs among all individuals and correct the returned values so that the value is always given in respect to the minor allele. Then we’ll plot a histogram of the MAFs of all SNPs
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
maf = rowMeans(snps[-1])/2
sum(maf > 0.5) # so there are 21130 alleses which are actually major alleles in this population sample
## [1] 21130
# To ensure we actually get the MAF this needs to be inverted.
maf <- pmin(maf, 1-maf)
truehist(maf, main = "Histogram of MAF values.", col = "steelblue")
lines(density(maf), lty = 2, col = "darkorange", lwd = 3)
In an eQTL study often a minimum MAF is required. Since MAF essentially reflects how often an allele has been observed in a population, it also defines how often the gene expression levels have been observed for heterozygous and homozygous alleles.
Calculate the number of heterozygous and homozygous observations expected for SNPs with a MAF of 5%, 10% and 15% in a sample of 500 individuals given Hardy-Weinberg equilibrium. What are useful MAF thresholds for SNPs to include in an eQTL analysis?
p = c(0.05, 0.1, 0.15)
q = 1-p
# Calulate frequency of minor allele being present in homozygous and heterozygous state
f_hom = p^2
f_het = 2*p*q
# Expected number of observations in a sample size of 10000
sample_size = 500
round(f_hom * sample_size)
## [1] 1 5 11
round(f_het * sample_size)
## [1] 48 90 128
Now that we have an idea of what is stored in the genotype data frame let’s take a look at the expression data. For eQTL analyses it is important for the gene expression to be normally distributed among samples, therefore RNA-seq data has to be transformed by, for example quantile normalization.
Let’s check the distribution of gene expression levels across samples for the first gene in our expr data frame.
gname = expr$geneid[1]
gname
## [1] "ENSG00000215270.3"
truehist(as.numeric(filter(expr, geneid == gname)[-1]), main = paste("Gene expression profile for gene:",gname), xlab = "Expression level", col = "darkorange")
lines(density(as.numeric(filter(expr, geneid == gname)[-1])), lty = 2, col = "steelblue", lwd = 3)
Now we’ll plot the expression levels of the first gene against the first SNP, 10th gene against the 10th SNP, 10th gene against the 1th SNP, as well as the 10th gene against the 6th SNP, depending on the genotypes of the samples by using simple dot plots. We’ll add a bit of random noise (jitter) to the genotype data to make it all look more comprehensible.
genotype = c("snp_1", "snp_10", "snp_1", "snp_6")
genes = c("gene_1", "gene_10", "gene_10", "gene_10")
par(mfrow=c(1,length(genotype)))
plot(jitter(as.numeric(snps[1,-1]), factor = 0.5), as.numeric(expr[1,-1]),
xlab = genotype[1], ylab = genes[1], col = "steelblue",
main = paste(genes[1], "vs", genotype[1]), xlim= c(-0.5,2.5), xaxt="n")
axis(1, at =c (0,1,2), labels = c("0", "1", "2"))
plot(jitter(as.numeric(snps[10,-1]), factor = 0.5), as.numeric(expr[10,-1]),
xlab = genotype[2], ylab = genes[2], col = "steelblue",
main = paste(genes[2], "vs", genotype[2]), xlim= c(-0.5,2.5), xaxt="n")
axis(1, at =c (0,1,2), labels = c("0", "1", "2"))
plot(jitter(as.numeric(snps[1,-1]), factor = 0.5), as.numeric(expr[10,-1]),
xlab = genotype[3], ylab = genes[3], col = "steelblue",
main = paste(genes[3], "vs", genotype[3]), xlim= c(-0.5,2.5), xaxt="n")
axis(1, at =c (0,1,2), labels = c("0", "1", "2"))
plot(jitter(as.numeric(snps[6,-1]), factor = 0.5), as.numeric(expr[10,-1]),
xlab = genotype[4], ylab = genes[4], col = "steelblue",
main = paste(genes[4], "vs", genotype[4]), xlim= c(-0.5,2.5), xaxt="n")
axis(1, at =c (0,1,2), labels = c("0", "1", "2"))
Let’s do a bit of data wrangling for easier downstream analysis and efficient plotting. Namely we’ll transpose our data frames so that we have variables, i.e. SNPs and expression levels as columns and samples as rows.
expr_trans = data.frame(t(expr[, -1]))
colnames(expr_trans)=t(expr[, 1])
expr_trans = tibble::rownames_to_column(expr_trans, "sample")
head(expr_trans)[1:10]
## sample ENSG00000215270.3 ENSG00000237438.2 ENSG00000273203.1
## 1 HG00096 0.7878784 0.46099598 0.33084190
## 2 HG00097 1.6030633 1.29599155 0.64136258
## 3 HG00099 1.2839206 0.03093323 0.02358726
## 4 HG00100 1.8733944 0.93279608 0.11472217
## 5 HG00101 1.8611959 1.44133066 0.46404172
## 6 HG00102 0.5027009 0.12111466 0.21109161
## ENSG00000273442.1 ENSG00000177663.9 ENSG00000183307.3 ENSG00000069998.8
## 1 0.15617620 3.064441 0.04658481 11.122808
## 2 0.29658085 2.758744 0.02211631 11.310690
## 3 0.00000000 3.167703 0.03985498 13.533813
## 4 0.08664862 2.225839 0.02584587 12.859737
## 5 0.27669990 3.434325 0.05502338 7.176259
## 6 0.09964725 2.151189 0.01486157 6.181117
## ENSG00000093072.11 ENSG00000099954.14
## 1 11.195425 0.011522319
## 2 7.668832 0.047408942
## 3 11.774297 0.006571839
## 4 9.425467 0.000000000
## 5 12.055840 0.000000000
## 6 10.282059 0.007351744
# and the same for genotype data
snps_trans = data.frame(t(snps[-1]))
colnames(snps_trans)=t(snps[, 1])
snps_trans = tibble::rownames_to_column(snps_trans, "sample")
head(snps_trans)[1:10]
## sample chr22_16051249 chr22_16052080 chr22_16052962 chr22_16052986
## 1 HG00096 0 1 0 0
## 2 HG00097 1 0 1 0
## 3 HG00099 1 0 1 0
## 4 HG00100 0 0 0 0
## 5 HG00101 0 0 0 0
## 6 HG00102 1 0 1 0
## chr22_16053444 chr22_16053659 chr22_16053791 chr22_16053862
## 1 0 1 0 0
## 2 0 1 0 1
## 3 0 1 0 1
## 4 0 2 1 0
## 5 0 2 1 0
## 6 0 1 0 1
## chr22_16053863
## 1 1
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
Another convenient way to display gene expression values by genotype is as box plots. These provide a good, nonparametric, indication of the distributions. To convey a sense of the frequency of each genotype in the sample it is useful to also add points for each individual to the plot. Below is an example of how this might look for first ten SNP/gene pairs. This time we’ll use R’s ggplot2 library to generate visualization.
library(ggplot2)
#Reshape dataframes a bit for use with ggplot2
snps_long = tidyr::gather(snps_trans[, 1:5], snp, genotype, -sample)
expr_long = tidyr::gather(expr_trans[, 1:5], gene, expression, -sample)
head(snps_long)
## sample snp genotype
## 1 HG00096 chr22_16051249 0
## 2 HG00097 chr22_16051249 1
## 3 HG00099 chr22_16051249 1
## 4 HG00100 chr22_16051249 0
## 5 HG00101 chr22_16051249 0
## 6 HG00102 chr22_16051249 1
head(expr_long)
## sample gene expression
## 1 HG00096 ENSG00000215270.3 0.7878784
## 2 HG00097 ENSG00000215270.3 1.6030633
## 3 HG00099 ENSG00000215270.3 1.2839206
## 4 HG00100 ENSG00000215270.3 1.8733944
## 5 HG00101 ENSG00000215270.3 1.8611959
## 6 HG00102 ENSG00000215270.3 0.5027009
data_long <- cbind(snps_long, expr_long["expression"])
data_long$genotype <- as.factor(data_long$genotype)
head(data_long)
## sample snp genotype expression
## 1 HG00096 chr22_16051249 0 0.7878784
## 2 HG00097 chr22_16051249 1 1.6030633
## 3 HG00099 chr22_16051249 1 1.2839206
## 4 HG00100 chr22_16051249 0 1.8733944
## 5 HG00101 chr22_16051249 0 1.8611959
## 6 HG00102 chr22_16051249 1 0.5027009
ggplot(data_long, aes(genotype, expression)) +
geom_jitter(colour = "darkorange",alpha = 0.3, width = 0.02) +
geom_boxplot(alpha = 0.5, fill = "steelblue") +
facet_wrap(~snp)
# Let's do that for more SNPs and genes, just to see whether we could catch any
# visible correlations
snps_long = tidyr::gather(snps_trans[, 1:10], snp, genotype, -sample)
expr_long = tidyr::gather(expr_trans[, 1:10], gene, expression, -sample)
head(snps_long)
## sample snp genotype
## 1 HG00096 chr22_16051249 0
## 2 HG00097 chr22_16051249 1
## 3 HG00099 chr22_16051249 1
## 4 HG00100 chr22_16051249 0
## 5 HG00101 chr22_16051249 0
## 6 HG00102 chr22_16051249 1
head(expr_long)
## sample gene expression
## 1 HG00096 ENSG00000215270.3 0.7878784
## 2 HG00097 ENSG00000215270.3 1.6030633
## 3 HG00099 ENSG00000215270.3 1.2839206
## 4 HG00100 ENSG00000215270.3 1.8733944
## 5 HG00101 ENSG00000215270.3 1.8611959
## 6 HG00102 ENSG00000215270.3 0.5027009
data_long <- cbind(snps_long, expr_long["expression"])
data_long$genotype <- as.factor(data_long$genotype)
head(data_long)
## sample snp genotype expression
## 1 HG00096 chr22_16051249 0 0.7878784
## 2 HG00097 chr22_16051249 1 1.6030633
## 3 HG00099 chr22_16051249 1 1.2839206
## 4 HG00100 chr22_16051249 0 1.8733944
## 5 HG00101 chr22_16051249 0 1.8611959
## 6 HG00102 chr22_16051249 1 0.5027009
ggplot(data_long, aes(genotype, expression)) +
geom_jitter(colour = "darkorange",alpha = 0.3, width = 0.02) +
geom_boxplot(alpha = 0.5, fill = "steelblue",
position = position_dodge(width = 0)) +
facet_wrap(~snp)
This chapter should explain the basic ideas behind eQTL analyses. What we are doing here is not what one would do to run an actual eQTL analysis. Here we’ll try to explain how eQTL mapping works in general.
The most common way of estimating the effect of a SNP on gene expression is by performing a linear regression of sample genotypes on sample gene expression levels. So to obtain estimates of the genotypic contribution to gene expression we fit a simple linear regression model of the form \(E_i = \beta_0 + \beta G_i + \epsilon\), where \(E_i\) is the vector of gene expression values for gene \(i\) and \(G_i\) is the genotype vector for the SNP \(i\). We are interested in the estimate for \(\beta\) which indicates the change in gene expression for each copy of the second allele
The p-value indicates the significance of the genetic component in the model. Let’s try that for gene 10 with SNP 1 and SNP 6.
lm_1_10 = lm(expr_trans[, 11] ~ snps_trans[, 2])
summary(lm_1_10)
##
## Call:
## lm(formula = expr_trans[, 11] ~ snps_trans[, 2])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.056455 -0.021535 -0.003053 0.017321 0.145656
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.056455 0.001792 31.510 <2e-16 ***
## snps_trans[, 2] -0.002931 0.003914 -0.749 0.455
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03094 on 356 degrees of freedom
## Multiple R-squared: 0.001572, Adjusted R-squared: -0.001232
## F-statistic: 0.5605 on 1 and 356 DF, p-value: 0.4545
lm_6_10 = lm(expr_trans[, 11] ~ snps_trans[, 7])
summary(lm_6_10)
##
## Call:
## lm(formula = expr_trans[, 11] ~ snps_trans[, 7])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.056641 -0.022338 -0.003267 0.017837 0.146785
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.054012 0.004058 13.31 <2e-16 ***
## snps_trans[, 7] 0.001315 0.002577 0.51 0.61
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03095 on 356 degrees of freedom
## Multiple R-squared: 0.0007307, Adjusted R-squared: -0.002076
## F-statistic: 0.2603 on 1 and 356 DF, p-value: 0.6102
This is the standard summary output from R for linear regressions. Since we are interested in eQTLs our main interest lies in the second line of “Coefficients”. What is stated as “Estimate” is the slope of the linear regression, which in eQTL terms is called “effect size” or already mentioned “beta”. In eQTL studies one normally compares thousands of genes for which each hundreds to thousands of SNPs have been tested. The common way to identify eQTLs is by their p-value. The p-value given here as (Pr(>|t|)) will later be referred to as raw p-value. It can be calculated in many different ways, here it is based on the t-value which is derived from the estimation of the coefficient and its standard error. For a nice explanation of the summary(lm) output check here.
It is obvious that these two models don’t catch any significant effect of explored SNPs and gene expression levels. In fact it is very hard to catch such effects by “manually” exploring real-world genomic data, usually containing tens or hundreds of thousands SNPs and hundreds or thousands gene expressions for large number of samples.
Hence for the sake of making a comprehensable example we’ll partially repeat the previous EDA process with a dummy, i.e. simulated data set, which can be found here
gt = read.table("/Users/igorhut/Documents/eQTL/jknightlab-eqtl-intro-e17c9f5/docker/data/simulated/sim_genotypes.tab", sep="\t",
header=TRUE, row.names = 1)
expr = read.table("/Users/igorhut/Documents/eQTL/jknightlab-eqtl-intro-e17c9f5/docker/data/simulated/sim_expression1.tab", sep="\t",
header=TRUE, row.names = 1)
head(gt)
## snp_1 snp_2 snp_3 snp_4 snp_5 snp_6 snp_7 snp_8 snp_9 snp_10
## sample_1 0 0 0 0 1 0 0 0 2 2
## sample_2 0 0 0 0 0 0 0 0 2 2
## sample_3 0 0 0 0 0 0 0 0 1 2
## sample_4 0 0 0 0 0 0 1 1 0 1
## sample_5 0 0 0 0 0 0 0 1 0 1
## sample_6 0 1 0 0 0 0 0 0 0 0
head(expr)
## gene_1 gene_2 gene_3 gene_4 gene_5 gene_6 gene_7
## sample_1 6.091035 6.479307 6.702659 7.260538 8.196299 6.061722 6.548776
## sample_2 7.432563 7.016138 7.508599 7.295998 7.836607 7.825705 7.514788
## sample_3 6.917247 6.127383 6.097665 6.324987 6.017944 5.888059 6.912573
## sample_4 7.318114 6.791866 7.942633 6.894796 7.498120 7.199753 7.900366
## sample_5 7.288953 6.982799 8.120189 7.400018 7.401130 7.032545 7.413375
## sample_6 7.688428 8.420471 7.709514 8.422695 6.536603 8.340534 8.668130
## gene_8 gene_9 gene_10
## sample_1 6.316635 9.646378 9.811172
## sample_2 8.359249 10.433948 10.455313
## sample_3 6.830853 8.312266 9.678442
## sample_4 8.651775 6.560307 9.509531
## sample_5 8.948665 7.482829 8.749063
## sample_6 7.582129 7.638559 8.011134
dim(gt)
## [1] 300 10
dim(expr)
## [1] 300 10
# Calculate MAF and draw MAF histogram
maf <- colMeans(gt)/2
maf
## snp_1 snp_2 snp_3 snp_4 snp_5 snp_6
## 0.006666667 0.030000000 0.020000000 0.065000000 0.046666667 0.126666667
## snp_7 snp_8 snp_9 snp_10
## 0.171666667 0.298333333 0.390000000 0.511666667
maf <- pmin(maf, 1-maf)
maf
## snp_1 snp_2 snp_3 snp_4 snp_5 snp_6
## 0.006666667 0.030000000 0.020000000 0.065000000 0.046666667 0.126666667
## snp_7 snp_8 snp_9 snp_10
## 0.171666667 0.298333333 0.390000000 0.488333333
sum(maf > 0.5)
## [1] 0
truehist(maf, main = "Histogram of MAF values.", col = "steelblue")
lines(density(maf), lty = 2, col = "darkorange", lwd = 3)
snps = c("snp_1", "snp_10", "snp_1")
genes = c("gene_1", "gene_10", "gene_10")
par(mfrow=c(1,length(snps)))
for (index in seq(length(snps))){
genotype = gt[[snps[index]]]
expression = expr[[genes[index]]]
plot(jitter(genotype, factor = 0.4), expression,
main=paste(snps[index], "vs", genes[index]), xlim= c(-0.5,2.5),
xlab = "genotype", xaxt="n", col ="steelblue")
axis(1, at=c(0,1,2), labels = c("0", "1", "2"))
}
#Reshape dataframes a bit for use with ggplot2
genoLong = tidyr::gather(gt, snp, genotype)
exprLong = tidyr::gather(expr, gene, expression)
dataLong = cbind(genoLong, exprLong["expression"])
dataLong$genotype = as.factor(dataLong$genotype)
ggplot(dataLong, aes(genotype, expression)) +
geom_jitter(colour = "darkorange",alpha = 0.3, width = 0.2) +
geom_boxplot(alpha=0.6, fill="steelblue",
position=position_dodge()) +
facet_wrap(~snp)
Let’s again try linear regression for gene 10 with snp 1 and snp 10
lm_1_10 = lm(expr[, "gene_10"]~ gt[, "snp_1"])
summary(lm_1_10)
##
## Call:
## lm(formula = expr[, "gene_10"] ~ gt[, "snp_1"])
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5262 -1.1412 0.0577 1.1552 4.0071
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.55324 0.09138 93.600 <2e-16 ***
## gt[, "snp_1"] -1.07768 0.79138 -1.362 0.174
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.572 on 298 degrees of freedom
## Multiple R-squared: 0.006185, Adjusted R-squared: 0.00285
## F-statistic: 1.854 on 1 and 298 DF, p-value: 0.1743
lm_10_10 = lm(expr[, "gene_10"] ~ gt[,"snp_10"])
summary(lm_10_10)
##
## Call:
## lm(formula = expr[, "gene_10"] ~ gt[, "snp_10"])
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2863 -0.7161 0.0474 0.6658 2.4429
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.88488 0.10941 62.93 <2e-16 ***
## gt[, "snp_10"] 1.61627 0.08788 18.39 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.079 on 298 degrees of freedom
## Multiple R-squared: 0.5317, Adjusted R-squared: 0.5301
## F-statistic: 338.3 on 1 and 298 DF, p-value: < 2.2e-16
Let’s plot these regression lines over scatter plots from before:
snps = c("snp_1", "snp_10", "snp_1")
genes = c("gene_1", "gene_10", "gene_10")
par(mfrow=c(1,length(snps)))
for (index in seq(length(snps))){
genotype = gt[[snps[index]]]
expression = expr[[genes[index]]]
lm_result = lm(expression ~ genotype)
plot(jitter(genotype, factor = 0.4), expression,
main=paste(snps[index], "vs", genes[index]), xlim= c(-0.5,2.5),
xlab = "genotype", xaxt="n", col ="steelblue")
abline(lm_result, col="darkorange")
axis(1, at=c(0,1,2), labels = c("0", "1", "2"))
# Add p-values as text
y_range = range(expression)
text(x=1, y=y_range[1] + 0.95*diff(y_range), paste0("p=",
format(summary(lm_result)$coefficients[2,4],
scentific=TRUE, digits=2)))
}
A nicer way to plot these results via ggplot2:
genoLong = tidyr::gather(gt, snp, genotype, snp_1, snp_10)
exprLong = tidyr::gather(expr, gene, expression, gene_1, gene_10)
dataLong = cbind(genoLong[,c("snp", "genotype")], exprLong[,c("gene", "expression")])
dataLong$comparison = paste(dataLong$snp, "vs", dataLong$gene)
dataLong$genotype = factor(dataLong$genotype)
ggplot(dataLong, aes(genotype, expression)) +
geom_jitter(col="darkorange", position=position_jitter(width=0.25)) +
geom_boxplot(outlier.size=0, alpha=0.6, fill="steelblue") +
geom_smooth(method = 'lm',col="darkred", aes(group=1), se=FALSE) +
facet_wrap(~comparison)
fit <- mapply(function(e, g) lm(e ~ g), expr, gt, SIMPLIFY=FALSE)
betaHat <- sapply(fit, coef)[2,]
betaHat
## gene_1 gene_2 gene_3 gene_4 gene_5 gene_6 gene_7
## 0.9416734 1.3518024 1.1324493 1.5005237 1.4329068 1.2928385 1.5106722
## gene_8 gene_9 gene_10
## 1.6050215 1.6038507 1.6162730
We’ll use the R’s function confint to obtain 95% confidence intervals of the estimated SNP effects.
ci <- sapply(fit, confint, "g")
rownames(ci) <- c("lower", "upper")
ci
## gene_1 gene_2 gene_3 gene_4 gene_5 gene_6 gene_7
## lower -0.1043169 0.8457378 0.5723167 1.156449 1.017273 1.026215 1.286452
## upper 1.9876638 1.8578670 1.6925819 1.844598 1.848541 1.559462 1.734892
## gene_8 gene_9 gene_10
## lower 1.412641 1.426662 1.443334
## upper 1.797402 1.781040 1.789212
Now let’s plot confidence intervals for \(\hat{\beta_i}\) in respect to MAFs.
estimates = data.frame(Estimate=betaHat, t(ci), MAF=maf)
ggplot(estimates, aes(x=MAF)) +
geom_hline(yintercept=1.5, colour = "darkorange") +
geom_hline(yintercept=0, colour = "darkred", linetype="longdash") +
geom_errorbar(aes(ymin=lower, ymax=upper), colour = "steelblue") +
geom_point(aes(y=Estimate), colour = "steelblue")
In this example all resulting confidence intervals include the true value but intervals for small minor allele frequencies are large (and in one case this means that 0 is included in the CI). As one would expect the uncertainty in the estimate, as measured by the width of the confidence interval, decreases with increasing minor allele frequency. However, even at high MAF considerable uncertainty remains and point estimates are somewhat lacking in accuracy, overestimating the true effect.
Many different factors can affect gene expression, such as age, sex, smoking habits, genetic mutations or environmental factors, such as nutrition, etc. The more factors can be described in the model, the more accurate it will be and the higher are chances to find more subtle genetic effects. Covariates therefore are features of samples which may describe effects on gene expression. In technical terms one covariate is therefore a vector of the same length as there are samples, e.g.: age. The examples before worked nicely, because it was simulated data without any covariates. Now we will be using data where covariates have been modelled additionally. Let’s see how linear regression models behave depending on inclusion of covaraiates. We’ll calculate the linear regression with and without the covariates for combination of gene 10 with snp 10.
expr_cov = read.table("/Users/igorhut/Documents/eQTL/jknightlab-eqtl-intro-e17c9f5/docker/data/simulated//sim_expression2.tab", sep="\t",
header=TRUE, row.names = 1)
covariates = read.table("/Users/igorhut/Documents/eQTL/jknightlab-eqtl-intro-e17c9f5/docker/data/simulated//sim_covariates.tab", sep="\t",
header=TRUE, row.names = 1)
head(expr_cov, 3)
## gene_1 gene_2 gene_3 gene_4 gene_5 gene_6
## sample_1 11.950277 12.338549 12.561901 13.119780 14.055542 11.92096
## sample_2 19.742334 19.325909 19.818370 19.605769 20.146379 20.13548
## sample_3 -1.509752 -2.299616 -2.329334 -2.102012 -2.409055 -2.53894
## gene_7 gene_8 gene_9 gene_10
## sample_1 12.408018 12.175877 15.5056204 15.670415
## sample_2 19.824559 20.669020 22.7437190 22.765084
## sample_3 -1.514426 -1.596146 -0.1147331 1.251443
head(covariates, 3)
## var_1 var_2 var_3 var_4 var_5 var_6
## sample_1 0.1805229 0.65730433 2.1202020 0.7086180 0.2261268 -1.3233594
## sample_2 0.7847340 1.03631767 0.3571627 0.8541129 -0.3856621 1.0153862
## sample_3 -1.3531646 -0.04268473 0.4329161 0.1914436 -1.0722882 0.1656182
## var_7 var_8 var_9 var_10 var_11 var_12
## sample_1 0.0693342 -1.2700925 1.9314654 0.1893478 -0.02579421 1.4651498
## sample_2 0.9291073 0.7901720 0.7504875 0.1566840 0.46778575 0.4504485
## sample_3 -0.9194574 -0.1477517 -2.5731055 0.2177240 0.26250697 0.4895526
## var_13 var_14 var_15 var_16 var_17 var_18
## sample_1 -1.7309505 -0.8570105 0.3502420 -0.5500488 0.5100173 -1.7280022
## sample_2 0.2793073 -0.4621151 0.8739229 -0.9148169 1.0457920 -1.0324303
## sample_3 1.1093786 0.1780986 -0.7811387 0.4241464 -0.4223912 0.1432051
## var_19 var_20
## sample_1 0.6182252 -0.09250967
## sample_2 -3.0233684 -0.16821972
## sample_3 -1.5812401 0.91562847
dim(expr_cov)
## [1] 300 10
dim(covariates)
## [1] 300 20
lm_10_10 = lm(expr_cov[,"gene_10"] ~ gt[,"snp_10"])
summary(lm_10_10)
##
## Call:
## lm(formula = expr_cov[, "gene_10"] ~ gt[, "snp_10"])
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.8886 -4.4990 -0.2714 5.1006 20.2046
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.8321 0.7492 7.784 1.16e-13 ***
## gt[, "snp_10"] 2.9265 0.6018 4.863 1.87e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.391 on 298 degrees of freedom
## Multiple R-squared: 0.07353, Adjusted R-squared: 0.07042
## F-statistic: 23.65 on 1 and 298 DF, p-value: 1.873e-06
lm_10_10_covs = lm(expr_cov[,"gene_10"] ~ gt[,"snp_10"] + as.matrix(covariates))
summary(lm_10_10_covs)
##
## Call:
## lm(formula = expr_cov[, "gene_10"] ~ gt[, "snp_10"] + as.matrix(covariates))
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4494 -0.6804 0.0657 0.6722 2.5683
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.82775 0.11393 59.928 < 2e-16 ***
## gt[, "snp_10"] 1.65776 0.09172 18.074 < 2e-16 ***
## as.matrix(covariates)var_1 2.50006 0.06416 38.967 < 2e-16 ***
## as.matrix(covariates)var_2 2.24993 0.07045 31.935 < 2e-16 ***
## as.matrix(covariates)var_3 2.21608 0.06487 34.163 < 2e-16 ***
## as.matrix(covariates)var_4 2.20670 0.06667 33.097 < 2e-16 ***
## as.matrix(covariates)var_5 2.18179 0.06466 33.740 < 2e-16 ***
## as.matrix(covariates)var_6 1.90274 0.06336 30.031 < 2e-16 ***
## as.matrix(covariates)var_7 1.87298 0.06779 27.631 < 2e-16 ***
## as.matrix(covariates)var_8 1.67675 0.06664 25.163 < 2e-16 ***
## as.matrix(covariates)var_9 1.64621 0.06771 24.311 < 2e-16 ***
## as.matrix(covariates)var_10 1.55134 0.06076 25.531 < 2e-16 ***
## as.matrix(covariates)var_11 1.48276 0.06598 22.475 < 2e-16 ***
## as.matrix(covariates)var_12 1.31934 0.06802 19.396 < 2e-16 ***
## as.matrix(covariates)var_13 1.23396 0.06564 18.799 < 2e-16 ***
## as.matrix(covariates)var_14 1.14690 0.06383 17.967 < 2e-16 ***
## as.matrix(covariates)var_15 1.13261 0.06344 17.854 < 2e-16 ***
## as.matrix(covariates)var_16 0.96792 0.06647 14.562 < 2e-16 ***
## as.matrix(covariates)var_17 0.75899 0.06875 11.039 < 2e-16 ***
## as.matrix(covariates)var_18 0.70516 0.06483 10.876 < 2e-16 ***
## as.matrix(covariates)var_19 0.64040 0.06584 9.727 < 2e-16 ***
## as.matrix(covariates)var_20 0.50829 0.06822 7.450 1.18e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.084 on 278 degrees of freedom
## Multiple R-squared: 0.9814, Adjusted R-squared: 0.98
## F-statistic: 699.3 on 21 and 278 DF, p-value: < 2.2e-16
Let’s plot the results:
par(mfrow=c(1,2))
plot(jitter(gt[,"snp_10"], factor = 0.4), expr_cov[,"gene_10"],
main="gene_10 vs snp_10", xlim= c(-0.5,2.5),
xlab = "genotype", xaxt="n", col ="steelblue")
abline(lm_10_10, col="darkorange", lwd = 2, lty = 2)
axis(1, at=c(0,1,2), labels = c("0", "1", "2"))
# Add p-values as text
y_range = range(expr_cov[,"gene_10"])
text(x=1, y=y_range[1] + 0.95*diff(y_range), paste0("p=",
format(summary(lm_10_10)$coefficients[2,4],
scentific=TRUE, digits=2)), col = "darkorange")
plot(jitter(gt[,"snp_10"], factor = 0.4), expr_cov[,"gene_10"],
main="gene_10 vs snp_10 incl. cov.", xlim= c(-0.5,2.5),
xlab = "genotype", xaxt="n", col ="steelblue")
abline(lm_10_10_covs, col="darkorange", lwd = 2, lty = 2)
## Warning in abline(lm_10_10_covs, col = "darkorange", lwd = 2, lty = 2):
## only using the first two of 22 regression coefficients
axis(1, at=c(0,1,2), labels = c("0", "1", "2"))
# Add p-values as text
y_range = range(expr_cov[,"gene_10"])
text(x=1, y=y_range[1] + 0.95*diff(y_range), paste0("p=",
format(summary(lm_10_10_covs)$coefficients[2,4],
scentific=TRUE, digits=2)), col = "darkorange")
Further we will explore the use of principle components as covariates in linear models of gene expression to account for unknown sources of variation. Let’s use the data set from the previous exercise and include PC’s, as covariates, in the linear model instead of known covariates we used previously.
Check the data:
head(expr_cov, 3)
## gene_1 gene_2 gene_3 gene_4 gene_5 gene_6
## sample_1 11.950277 12.338549 12.561901 13.119780 14.055542 11.92096
## sample_2 19.742334 19.325909 19.818370 19.605769 20.146379 20.13548
## sample_3 -1.509752 -2.299616 -2.329334 -2.102012 -2.409055 -2.53894
## gene_7 gene_8 gene_9 gene_10
## sample_1 12.408018 12.175877 15.5056204 15.670415
## sample_2 19.824559 20.669020 22.7437190 22.765084
## sample_3 -1.514426 -1.596146 -0.1147331 1.251443
dim(expr_cov)
## [1] 300 10
R provides the function prcomp for computing PCs. Like most standard R functions it expects data to be laid out with variables in columns and samples in rows, so we are good to go.
pca <- prcomp(expr_cov, center=TRUE, scale = TRUE)
head(pca$x, 3)
## PC1 PC2 PC3 PC4 PC5
## sample_1 -2.276793 -0.06354526 0.3304429 0.11776561 0.07185696
## sample_2 -5.358711 -0.11632936 0.2518289 -0.02015748 0.06394395
## sample_3 3.919988 0.06913189 0.2497241 -0.06144515 0.10165365
## PC6 PC7 PC8 PC9 PC10
## sample_1 0.202507384 -0.038418155 -0.10083580 -0.08100452 0.06052746
## sample_2 -0.004416506 -0.001257329 -0.03820300 -0.08312656 -0.02748060
## sample_3 0.058176697 -0.001891389 0.09413981 -0.03593804 -0.09582196
As you can see, since we have 10 variables, we got 10 PCs in total. Let’s plot percentage of explained variance for all PCs.
library(factoextra)
plot(pca)
fviz_eig(pca, addlabels = TRUE)
The variance accounted for by each component is available through the
sdev field of the prcomp return value:
pca$sdev
## [1] 3.14394553 0.15615397 0.14470210 0.13538318 0.12397628 0.10219570
## [7] 0.08938052 0.08844315 0.07362859 0.07006292
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 3.1439 0.15615 0.14470 0.13538 0.12398 0.10220
## Proportion of Variance 0.9884 0.00244 0.00209 0.00183 0.00154 0.00104
## Cumulative Proportion 0.9884 0.99088 0.99297 0.99480 0.99634 0.99739
## PC7 PC8 PC9 PC10
## Standard deviation 0.08938 0.08844 0.07363 0.07006
## Proportion of Variance 0.00080 0.00078 0.00054 0.00049
## Cumulative Proportion 0.99818 0.99897 0.99951 1.00000
# or
sum(pca$sdev[1:5]^2)/sum(pca$sdev^2) # percentage of variace explained by first 5 PCs
## [1] 0.9963415
pca$sdev[1]^2/sum(pca$sdev^2) # percentage of variance explained by the 1st PC
## [1] 0.9884394
Since this is specifically modeled data, the first PC accounts for major part of variance. Let’s do modeling and see how the model behaves depending on inclusion of PCs.
# no PCs included
lm_9_9 = lm(expr_cov[,"gene_9"] ~ gt[,"snp_9"])
summary(lm_9_9)
##
## Call:
## lm(formula = expr_cov[, "gene_9"] ~ gt[, "snp_9"])
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.9702 -4.2002 0.0046 4.9352 20.0212
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.3809 0.6442 11.457 <2e-16 ***
## gt[, "snp_9"] 1.3727 0.6142 2.235 0.0262 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.459 on 298 degrees of freedom
## Multiple R-squared: 0.01648, Adjusted R-squared: 0.01318
## F-statistic: 4.994 on 1 and 298 DF, p-value: 0.02617
# let's now include the 1st PC
lm_9_9_PC1 = lm(expr_cov[,"gene_9"] ~ gt[,"snp_9"] + as.numeric(pca$x[,1]))
summary(lm_9_9_PC1)
##
## Call:
## lm(formula = expr_cov[, "gene_9"] ~ gt[, "snp_9"] + as.numeric(pca$x[,
## 1]))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.36081 -0.34148 -0.02407 0.33541 1.39800
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.410309 0.043561 170.11 <2e-16 ***
## gt[, "snp_9"] 1.334989 0.041534 32.14 <2e-16 ***
## as.numeric(pca$x[, 1]) -2.363213 0.009278 -254.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5044 on 297 degrees of freedom
## Multiple R-squared: 0.9955, Adjusted R-squared: 0.9955
## F-statistic: 3.298e+04 on 2 and 297 DF, p-value: < 2.2e-16
Let’s again plot the results:
par(mfrow=c(1,2))
plot(jitter(gt[,"snp_9"], factor = 0.4), expr_cov[,"gene_9"],
main="gene_9 vs snp_9", xlim= c(-0.5,2.5),
xlab = "genotype", xaxt="n", col ="steelblue")
abline(lm_9_9, col="darkorange", lwd = 2, lty = 2)
axis(1, at=c(0,1,2), labels = c("0", "1", "2"))
# Add p-values as text
y_range = range(expr_cov[,"gene_9"])
text(x=1, y=y_range[1] + 0.95*diff(y_range), paste0("p=",
format(summary(lm_9_9)$coefficients[2,4],
scentific=TRUE, digits=2)), col = "darkorange")
plot(jitter(gt[,"snp_9"], factor = 0.4), expr_cov[,"gene_9"],
main="gene_9 vs snp_9 incl. PC1", xlim= c(-0.5,2.5),
xlab = "genotype", xaxt="n", col ="steelblue")
abline(lm_9_9_PC1, col="darkorange", lwd = 2, lty = 2)
## Warning in abline(lm_9_9_PC1, col = "darkorange", lwd = 2, lty = 2): only
## using the first two of 3 regression coefficients
axis(1, at=c(0,1,2), labels = c("0", "1", "2"))
# Add p-values as text
y_range = range(expr_cov[,"gene_9"])
text(x=1, y=y_range[1] + 0.95*diff(y_range), paste0("p=",
format(summary(lm_9_9_PC1)$coefficients[2,4],
scentific=TRUE, digits=2)), col = "darkorange")
Now that we’ve covered the basics let’s focus on real world expression and genotyping data. We’ll be using R’s MatrixEQTL package which is designed for fast eQTL analysis on large genomic data sets. MatrixEQTL can test for association between genotype and gene expression using linear regression with either additive or ANOVA genotype effects. The models can include covariates to account for factors as population stratification, gender, and clinical variables. It also supports models with heteroscedastic and/or correlated errors, false discovery rate estimation and separate treatment of local (cis) and distant (trans) eQTLs. You can learn more about MatrixEQTL here.
Genotype and expression data come in all sorts of flavours and preprocessing of this data needs to be done rather carefully. In this short introduction it is not possible to cover this topic in detail. Common input formats for genotypes are VCF, PLINK files, or even other custom files which give the genotype of each sample on all queried genomic positions. Expression data can also be made available in various formats depending on the underlying technology (RNAseq or expression micro array). Good portion of publicly available eQTL data sets are, still, from microarray experiments.
To perform an eQTL analysis we don’t only need to know the genotype and gene expression values for every sample, but also the genomic positions of genes and SNPs. This is necessary to define which SNPs should be tested against which genes. For cis-eQTL analyses SNPs in proximity to the gene are chosen and for trans-eQTL analyses SNPs further away, or on different chromosomes, are taken into account. The window in cis-eQTL analysis is commonly chosen to be 500kb-1Mb measured from gene’s TSS.
Now let’s go through an example of cis- trans-eQTL mapping with chr22_GEUVADIS_358_continental data.
# eQTL mapping, cis/trans, no pcs
suppressMessages(library(MatrixEQTL))
SNP_file_name <- "/Users/igorhut/Documents/eQTL/Data/Voja_QTL_tools/SNP_voja.txt";
snps_location_file_name <- "/Users/igorhut/Documents/eQTL/Data/Voja_QTL_tools/snpsloc_voja.txt";
expression_file_name <- "/Users/igorhut/Documents/eQTL/Data/Voja_QTL_tools/GE_voja.txt";
gene_location_file_name <- "/Users/igorhut/Documents/eQTL/Data/Voja_QTL_tools/geneloc_voja.txt";
covariates_file_name <- "/Users/igorhut/Documents/eQTL/Data/Voja_QTL_tools/cov_voja.txt";
cis_threshold <- 1e-5
trans_threshold <- 1e-5
cis_dist <- 1e6
# Output file name
output_file_name_cis = tempfile();
output_file_name_tra = tempfile();
# output_file_name_cis = "/Users/igorhut/Documents/eQTL/tools_comparison_R_proj/output_voja_cis_eqtls_1.txt";
# output_file_name_tra = "/Users/igorhut/Documents/eQTL/tools_comparison_R_proj/output_voja_trans_eqtls_1.txt";
## Settings
# Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS
useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS
# Only associations significant at this level will be saved
pvOutputThreshold_cis = cis_threshold;
pvOutputThreshold_tra = trans_threshold;
# Set to character() for no covariates
# covariates_file_name = character();
# Error covariance matrix
# Set to numeric() for identity.
errorCovariance = numeric();
# Distance for local gene-SNP pairs
cisDist = cis_dist
## Load genotype data
snps = SlicedData$new();
snps$fileDelimiter = "\t"; # the TAB character
snps$fileOmitCharacters = "NA"; # denote missing values;
snps$fileSkipRows = 1; # one row of column labels
snps$fileSkipColumns = 1; # one column of row labels
snps$fileSliceSize = 2000; # read file in slices of 2,000 rows
snps$LoadFile(SNP_file_name);
## Load gene expression data
gene = SlicedData$new();
gene$fileDelimiter = "\t"; # the TAB character
gene$fileOmitCharacters = "NA"; # denote missing values;
gene$fileSkipRows = 1; # one row of column labels
gene$fileSkipColumns = 1; # one column of row labels
gene$fileSliceSize = 2000; # read file in slices of 2,000 rows
gene$LoadFile(expression_file_name);
## Load covariates
cvrt = SlicedData$new();
cvrt$fileDelimiter = "\t"; # the TAB character
cvrt$fileOmitCharacters = "NA"; # denote missing values;
cvrt$fileSkipRows = 1; # one row of column labels
cvrt$fileSkipColumns = 1; # one column of row labels
if(length(covariates_file_name)>0) {
cvrt$LoadFile(covariates_file_name);
}
## Run the analysis
snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE);
genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE);
me = Matrix_eQTL_main(
snps = snps,
gene = gene,
cvrt = cvrt,
output_file_name = output_file_name_tra,
pvOutputThreshold = pvOutputThreshold_tra,
useModel = useModel,
errorCovariance = errorCovariance,
verbose = FALSE,
output_file_name.cis = output_file_name_cis,
pvOutputThreshold.cis = pvOutputThreshold_cis,
snpspos = snpspos,
genepos = genepos,
cisDist = cisDist,
pvalue.hist = TRUE,
min.pv.by.genesnp = FALSE,
noFDRsaveMemory = FALSE);
me_qq = Matrix_eQTL_main(
snps = snps,
gene = gene,
cvrt = cvrt,
output_file_name = output_file_name_tra,
pvOutputThreshold = pvOutputThreshold_tra,
useModel = useModel,
errorCovariance = errorCovariance,
verbose = FALSE,
output_file_name.cis = output_file_name_cis,
pvOutputThreshold.cis = pvOutputThreshold_cis,
snpspos = snpspos,
genepos = genepos,
cisDist = cisDist,
pvalue.hist = "qqplot",
min.pv.by.genesnp = FALSE,
noFDRsaveMemory = FALSE)
unlink(output_file_name_tra);
unlink(output_file_name_cis);
cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n');
## Analysis done in: 21.943 seconds
head(me$cis$eqtls)
## snps gene statistic pvalue FDR
## 1 chr22_41256802 ENSG00000172404.4 31.85769 9.838311e-99 2.697776e-92
## 2 chr22_41272143 ENSG00000172404.4 30.00462 9.694354e-93 1.329151e-86
## 3 chr22_41317083 ENSG00000172404.4 -29.79930 4.593491e-92 3.148968e-86
## 4 chr22_41339367 ENSG00000172404.4 -29.79930 4.593491e-92 3.148968e-86
## 5 chr22_41217786 ENSG00000172404.4 29.73128 7.699892e-92 4.222795e-86
## 6 chr22_41353455 ENSG00000172404.4 -29.58144 2.407304e-91 1.100183e-85
## beta
## 1 0.3185012
## 2 0.3081578
## 3 -0.3086846
## 4 -0.3086846
## 5 0.3079505
## 6 -0.3074922
head(me$trans$eqtls)
## snps gene statistic pvalue FDR
## 1 chr22_21722525 ENSG00000215513.7 -13.268920 5.428888e-32 2.672792e-24
## 2 chr22_18727810 ENSG00000215513.7 9.899769 3.340484e-20 8.223064e-13
## 3 chr22_21722525 ENSG00000206176.5 -9.647465 2.219881e-19 3.643029e-12
## 4 chr22_20400371 ENSG00000183506.12 8.409424 1.638824e-15 1.833918e-08
## 5 chr22_18727198 ENSG00000215513.7 8.364461 2.234997e-15 1.833918e-08
## 6 chr22_18727222 ENSG00000215513.7 8.364461 2.234997e-15 1.833918e-08
## beta
## 1 -0.56291102
## 2 0.32524771
## 3 -0.22652690
## 4 0.05278156
## 5 0.27834246
## 6 0.27834246
## Make the histogram of local and distant p-values
plot(me)
## Make the qq-plot of local and distant p-values
plot(me_qq)
Whenever multiple statistical tests are performed, a multiple testing correction has to be performed. This is necessary because many hypotheses are tested. Therefore each calculated association p-value has to be corrected for multiple testing. MatrixEQTL does this for you automatically and returns the corrected p-value as a false discovery rate (FDR). Common thresholds on FDR are 5% or 10%.
Linkage disequilibrium (LD) is a very important effect that plays a big role in genetic association studies. It describes the fact that genetic variants are not always inherited independently due to recombination patterns during reproduction. SNPs in LD are inherited in similar patterns and therefore can explain gene expression in similar ways. This means that LD makes it harder for association studies to identify one single SNP being associated with altered gene expression. Also it is possible that the combination of SNPs (as a haplotype) causes differences in gene expression and not only one single SNP. Watch this video which explains the basics of LD.
Commonly one selects at most one associated SNP per gene. If there are many SNPs associated with a gene it is most likely that those SNPs are highly linked to each other (“in high LD”) and therefore they describe the same effect. There are still cases in which genes are regulated by different SNPs independently, this cannot be readily determined from the table produced by MatrixEQTL. Here we will not try to identify the independent lead eQTL signals.
Let’s indentify which SNPs are (significantly) associated with which genes at a maximum FDR of 5% from the cis-eQTL results. We’ll print a table in which only the lead SNP per gene is given. Also we’ll add the MAF for every SNP in the table.
library(dplyr)
snp_values = read.table(SNP_file_name, row.names=1, header=TRUE)
snp_values = data.frame(snps = rownames(snp_values), snp_values, stringsAsFactors = FALSE)
top_eqtls = filter(me$cis$eqtls, FDR <= 0.05) %>%
arrange(FDR) %>%
distinct(gene, .keep_all = TRUE)
mafs = apply(as.matrix(snp_values[-1]),1,mean)/2
mafs = pmin(mafs, 1 - mafs)
mafs = data.frame(snps=names(mafs), maf = mafs)
top_eqtls = left_join(top_eqtls, mafs, by="snps")
head(top_eqtls)
## snps gene statistic pvalue FDR
## 1 chr22_41256802 ENSG00000172404.4 31.85769 9.838311e-99 2.697776e-92
## 2 chr22_45726345 ENSG00000100376.7 -28.29562 4.787518e-87 1.875417e-81
## 3 chr22_42465260 ENSG00000183172.8 27.96028 6.544805e-86 9.445576e-81
## 4 chr22_26921725 ENSG00000260065.1 27.92949 8.327123e-86 9.788951e-81
## 5 chr22_24334948 ENSG00000184674.8 -27.29725 1.200698e-83 4.988560e-79
## 6 chr22_51001271 ENSG00000217442.3 25.63617 7.118432e-78 1.384365e-73
## beta maf
## 1 0.3185012 0.43435754
## 2 -3.5781293 0.13268156
## 3 4.0907715 0.31703911
## 4 0.4825401 0.47206704
## 5 -2.5667483 0.36312849
## 6 0.9441055 0.06284916
Whenever multiple statistical tests are performed, a multiple testing correction needs to be performed. This is necessary in order to compensate for the errors which arise from multiple hypothesis testing. Therefore each calculated association p-value has to be corrected. MatrixEQTL does this for you automatically and returns the corrected p-value as a false discovery rate (FDR). Common thresholds on FDR are 5% or 10%.
There are a few standard plots which are common in eQTL analyses. We already produced one of them earlier when we plotted gene expression versus genotype. This gives a visual insight in how clear the data was and what the linear regression actually detected.
Let’s now plot gene expression vs. genotype for the eqtl with the lowest association FDR-value. Also we will add linear regression line for this isolated case.
# For this we also need df with expression data
gene_values = read.table(expression_file_name, row.names=1, header=TRUE)
gene_values = data.frame(gene = rownames(gene_values), gene_values, stringsAsFactors = FALSE)
top_snp = top_eqtls$snps[1]
top_gene = as.character(top_eqtls$gene[1])
top_snp_data = filter(snp_values, snps == top_snp)
top_gene_data = filter(gene_values, gene == top_gene)
plot_data = t(bind_rows(top_snp_data[-1], top_gene_data[-1]))
colnames(plot_data) = c("snp", "gene_expr")
plot_data = as.data.frame(plot_data)
plot_data$snp = as.factor(plot_data$snp)
head(plot_data)
## snp gene_expr
## HG00096 0 0.2518932
## HG00097 1 0.5218350
## HG00099 2 0.8620147
## HG00100 1 0.5844244
## HG00101 1 0.5139019
## HG00102 2 0.9935348
lm_top = lm(plot_data[,"gene_expr"] ~ as.numeric(plot_data[,"snp"]))
summary(lm_top)
##
## Call:
## lm(formula = plot_data[, "gene_expr"] ~ as.numeric(plot_data[,
## "snp"]))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.46029 -0.09368 -0.00761 0.08633 0.49462
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.10229 0.02460 -4.158 4.02e-05 ***
## as.numeric(plot_data[, "snp"]) 0.30662 0.01096 27.974 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1458 on 356 degrees of freedom
## Multiple R-squared: 0.6873, Adjusted R-squared: 0.6864
## F-statistic: 782.6 on 1 and 356 DF, p-value: < 2.2e-16
plot(plot_data, col="steelblue",
main = paste0(top_gene, " vs ", top_snp))
abline(lm_top, col="darkorange", lwd = 2, lty = 2)
y_range = range(plot_data[,"gene_expr"])
text(x=2, y=y_range[1] + 0.5*diff(y_range), paste0("p=",
format(summary(lm_top)$coefficients[2,4],
scentific=TRUE, digits=2)), col = "darkorange")
Manhattan plots are a way to depict association p-values of multiple SNPs at once. They are also very common in GWAS. Manhattan plots are a rather convinient modality for interpretation of eQTL signals in terms of LD.
Let’s now generate a manhattan plot for gene ENSG00000172404.4, plotting the base-pair position on the x-axis and the \(-log_{10}(pvalue)\) of the SNP in the y axis. Manhattan plots usually depict all tested SNPs, not only the ones passing a certain p-value threshold. Therefore we’ll first obtain all the association p-values for all tested SNPs for gene ENSG00000172404.4.
gene_id = "ENSG00000172404.4"
gene_values = read.table(expression_file_name, row.names=1, header=TRUE)
single_gene_exp = SlicedData$new()
single_gene_exp$CreateFromMatrix(as.matrix(gene_values[gene_id, , drop=FALSE]))
single_gene_exp
## SlicedData object. For more information type: ?SlicedData
## Number of columns: 358
## Number of rows: 1
## Data is stored in 1 slices
## Top left corner of the first slice (up to 10x10):
## HG00096 HG00097 HG00099 HG00100 HG00101
## ENSG00000172404.4 0.2518932 0.521835 0.8620147 0.5844244 0.5139019
## HG00102 HG00103 HG00105 HG00106 HG00108
## ENSG00000172404.4 0.9935348 0.7540972 1.102222 0.5453733 0.1863607
snpspos = read.table(snps_location_file_name,
header = TRUE,
stringsAsFactors = FALSE)
genepos = read.table(gene_location_file_name,
header = TRUE,
stringsAsFactors = FALSE)
single_cis_eqtl_res = Matrix_eQTL_main(snps,
single_gene_exp,
verbose = FALSE,
output_file_name.cis = NULL,
output_file_name = NULL,
pvOutputThreshold.cis=1,
snpspos = snpspos,
genepos = genepos)
manh_data = merge(single_cis_eqtl_res$cis$eqtls, snpspos, by.x = "snps", by.y = "snipid")
manh_data = manh_data [,c("pos", "chr", "pvalue", "snps")]
head(manh_data)
## pos chr pvalue snps
## 1 40259861 chr22 0.9238947 chr22_40259861
## 2 40262482 chr22 0.7047404 chr22_40262482
## 3 40263359 chr22 0.6800275 chr22_40263359
## 4 40263997 chr22 0.7054801 chr22_40263997
## 5 40264551 chr22 0.7047404 chr22_40264551
## 6 40264584 chr22 0.9238947 chr22_40264584
# Plot the Manhattanplot
with(manh_data ,plot(pos, -log10(pvalue), xlab = "genomic position (bp)",
main=paste(gene_id, "associated SNPs")))
# Highlight the lead SNP
with(manh_data[which.min(manh_data$pvalue),,drop=FALSE] ,
points(pos, -log10(pvalue), pch=20, col="red"))
# Add a label to the lead SNP
with(manh_data[which.min(manh_data$pvalue),,drop=FALSE],
text(pos + diff(range(manh_data$pos))*0.2, -log10(pvalue), labels = snps))
Now produce manhattan plots for the top 3 cis-eQTL results and for the bottom 3 cis-eQTL in the top_eqtls matrix we have created earlier.
for (gene_id in top_eqtls$gene[c(1:3,(nrow(top_eqtls)-3):nrow(top_eqtls))]){
print(gene_id)
single_gene_exp = SlicedData$new()
single_gene_exp$CreateFromMatrix(as.matrix(gene_values[gene_id,, drop=FALSE]))
single_cis_eqtl_res = Matrix_eQTL_main(snps, single_gene_exp,
output_file_name.cis = NULL,
output_file_name = NULL,
pvOutputThreshold.cis=1,
verbose = FALSE,
snpspos=snpspos,
genepos=genepos)
manh_data = merge(single_cis_eqtl_res$cis$eqtl,
snpspos,
by.x="snps",
by.y = "snipid")
manh_data =manh_data[,c("pos", "chr", "pvalue", "snps")]
par(mfrow=c(1,1))
# Plot the Manhattanplot
with(manh_data ,plot(pos, -log10(pvalue), xlab = "genomic position (bp)",
main=paste(gene_id, "associated SNPs")))
# Highlight the lead SNP
with(manh_data[which.min(manh_data$pvalue),,drop=FALSE] ,
points(pos, -log10(pvalue), pch=20, col="red"))
# Add a label to the lead SNP
with(manh_data[which.min(manh_data$pvalue),,drop=FALSE],
text(pos + diff(range(manh_data$pos))*0.2, -log10(pvalue), labels = snps))
scan(stdin())
}
## [1] "ENSG00000172404.4"
## [1] "ENSG00000100376.7"
## [1] "ENSG00000183172.8"
## [1] "ENSG00000273203.1"
## [1] "ENSG00000236850.4"
## [1] "ENSG00000182541.13"
## [1] "ENSG00000183785.10"
Mostly there is a very clear eQTL signal visible, i.e. a distinct peak. Variants which are similar in “height” as the lead cis-eQTL SNP, but lower are most likely SNPs in LD with the lead SNP. In some cases horizontal lines (chunks) become visible which means that those variants are in very high LD among each other, i.e. they are usually inherited together. When variants are in very high LD (horizontal lines) their importance for gene expression cannot be distinguished. Other methods such as fine mapping try to use information like genome segmentation to break the LD blocks into smaller fractions of being more or less likely causal.
Usually cis-eQTL SNPs are located around the transcription starting site (TSS) of the associated gene. Depending on the dataset there may be a slight bias to more associations upstream the TSS. When looking at the SNP positions relative to the TSS one has to take the strand of the gene into account, as up- and downstream are always relative to the strand the gene lies on.
Distance distribution of cis−eQTLs.