\[ \beta = \frac{z} {\sqrt2p(1-p)(n + z^2)} \] \[ SE = \frac{1} {\sqrt2p(1-p)(n + z^2)} \] where \[p = MAF\]
# A README accompanies the file with allele frequencies from eQTLGen cis-eQTL analysis:
# 2018-07-18_SNP_AF_for_AlleleB_combined_allele_counts_and_MAF_pos_added.txt.gz
#
# Column Names
# ------------
# SNP - SNP rs ID
# hg19_chr - chr number
# hg19_pos - SNP position (hg19)
# AlleleA - Other allele
# AlleleB - Assessed allele
# allA_total - Total allele count of genotype AA
# allAB_total - Total allele count of genotype AB
# allB_total - Total allele count of genotype BB
# AlleleB_all - Allele frequency of assessed allele
# Additional information
# ----------------------
# The allele frequencies were calculated using reported allele counts from all cohorts except
# Framingham Heart Study, because the related samples are present in this cohort.
tbl=fread('cat 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz | gunzip')
# Extract ch16
eqtlGen16=tbl[which(tbl$SNPChr=="16"),]
# Read-in MAFs
eqtl_MAF=read.table(file = "2018-07-18_SNP_AF_for_AlleleB_combined_allele_counts_and_MAF_pos_added.txt",
sep = "\t", header=TRUE)
ch16_total <- merge(eqtlGen16,eqtl_MAF,by="SNP")
# Calculate betas and SEs
n=30596
ch16_total$Beta=ch16_total$Zscore/sqrt(2*(ch16_total$AlleleB_all)*
(1-ch16_total$AlleleB_all)*(n+ch16_total$Zscore^2))
ch16_total$SE=1/sqrt(2*(ch16_total$AlleleB_all)*
(1-ch16_total$AlleleB_all)*(n+ch16_total$Zscore^2))
write.table(ch16_total, file = "/n/home04/cdadams/coloc/ch16/ch16_total_maf_beta_se.txt", sep = "\t",
row.names = FALSE, col.names = TRUE, quote=FALSE)
# Order ch16 by position to select the region around rs12924886 on ch16
# The longevity GWAS used GRCh37/hg19; go to UCSC Browser and look up the snp in GRCh37/hg19
# Get the position: 72075343
# Select the SNPs within 1MB of 72075343
ch16_total=ch16_total[order(ch16_total$SNPChr),]
72075343+1000000
72075343-1000000
# Set: 71075343-73075343
region1=ch16_total[which(ch16_total$SNPPos>=71075343),]
region2=region1[which(region1$SNPPos<73075343),]
write.table(region2, file = "/n/home04/cdadams/coloc/ch16/rs12924886_1MB.txt", sep = "\t",
row.names = FALSE, col.names = TRUE, quote=FALSE)
# Select the SNPs within 3MB of 72075343
ch16_total=ch16_total[order(ch16_total$SNPChr),]
head(ch16_total)
72075343+3000000
72075343-3000000
# Set: 69075343-75075343)
region3=ch16_total[which(ch16_total$SNPPos>=69075343),]
region4=region1[which(region1$SNPPos<75075343),]
write.table(region4, file = "/n/home04/cdadams/coloc/ch16/rs12924886_3MB.txt", sep = "\t",
row.names = FALSE, col.names = TRUE, quote=FALSE)
I used MR-Base formatting to extract the SNPs from each GWAS. COLOC needs the SNPs to be set up as a matrix. (I kept some of the boiler-plate syntax for use with MR.)
exposure_ch16 <- read_exposure_data(
filename = "/n/home04/cdadams/coloc/ch16/rs12924886_3MB.txt", #ch16_total_maf_beta_se.txt
sep = '\t',
snp_col = 'SNP',
beta_col = 'Beta',
se_col = 'SE',
effect_allele_col = 'AssessedAllele',
#phenotype_col = 'eqtls_ch16',
#units_col = 'tissue',
other_allele_col = 'OtherAllele',
eaf_col = 'AlleleB_all',
#samplesize_col = 'samplesize',
#ncase_col = 'ncase',
#ncontrol_col = 'ncontrol',
gene_col = 'GeneSymbol',
pval_col = 'Pvalue',
id_col = 'eqtlGEN_ch16'
)
# NB: not clumped for LD
#This uses the unzipped longevity data without changing the original headers
long <- read_outcome_data(
snps = exposure_ch16$SNP,
filename = "/n/home04/cdadams/coloc/ch16/long_for_mr.txt", sep = '\t',
snp_col = 'rsid',
beta_col = 'beta1',
se_col = 'se',
effect_allele_col = 'a1',
#phenotype_col = 'tissue',
#units_col = 'tissue',
other_allele_col = 'a0',
eaf_col = 'freq1',
#samplesize_col = 'samplesize',
#ncase_col = 'ncase',
#ncontrol_col = 'ncontrol',
#gene_col = 'gene_name',
pval_col = 'p',
id_col = 'longevity'
)
#Harmonize the alleles btw the two gwas
ch16_long <- harmonise_data(exposure_ch16, long , action = 2)
dim(ch16_long)
write.csv(ch16_long, '/n/home04/cdadams/coloc/ch16/ch16_long.csv')
\[ Pr(\theta | y) = \frac{Pr(y | \theta) Pr(\theta)}{Pr(y)} \]
\[
Pr(\theta | y) \propto Pr(y | \theta) Pr(\theta)
\]
Language by Wallace (2014) (Giambartolomei et al. 2014) (creator of the method)
The idea behind the ABF analysis is that the association of each trait with SNPs in a region may be summarised by a vector of 0s and at most a single 1, with the 1 indicating the causal SNP (so, assuming a single causal SNP for each trait).
The posterior probability of each possible configuration can be calculated and so, crucially, can the posterior probabilities that the traits share their configurations. This allows us to estimate the support for the following cases:
- \(H_0\): neither trait has a genetic association in the region
- \(H_1\): only trait 1 has a genetic association in the region
- \(H_2\): only trait 2 has a genetic association in the region
- \(H_3\): both traits are associated, but with different causal variants
- \(H_4\): both traits are associated and share a single causal variant
NB: The paper uses a \(H_4\) of >75% as evidence of colocalization, & there’s a discussion about issues when \(H_3\) is > \(H_4\)
coloc.abf functionThe function
coloc.abfis ideally suited to the case when only summary data are available, and requires, for each trait, either:
- regression coefficients for each SNP
- variance of these regression coefficients.
It is now possible to compute the probabilities of interest.
With coefficients and std. deviation of Y known, we have the most accurate inference. NB: for a case-control trait, set type=“cc” and supply s, the proportion of samples in the dataset that are cases (from which sdY can be derived), instead of giving sdY directly.
library(coloc)
ch16_long=read.csv("ch16_long.csv")
my.res.ch16_long<- coloc.abf(dataset1=list(beta=ch16_long$beta.exposure,
varbeta=ch16_long$se.exposure, N=30596,type="quant"),
dataset2=list(beta=ch16_long$beta.outcome,
varbeta=ch16_long$se.outcome, N=1012240,type="quant"),
MAF=ch16_long$eaf.exposure)
## Warning in sdY.est(d$varbeta, d$MAF, d$N): estimating sdY from maf and varbeta,
## please directly supply sdY if known
## Warning in sdY.est(d$varbeta, d$MAF, d$N): estimating sdY from maf and varbeta,
## please directly supply sdY if known
## PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf PP.H4.abf
## 2.51e-09 9.82e-01 4.21e-11 1.65e-02 1.06e-03
## [1] "PP abf for shared variant: 0.106%"
print(my.res.ch16_long[[1]])
## nsnps PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf PP.H4.abf
## 1.071300e+04 2.505541e-09 9.824416e-01 4.208421e-11 1.650048e-02 1.057960e-03
If strong evidence for H4 (>75%), can extract the posterior probabilities for each SNP to be causal conditional on H4 being true. (This is part of the calculation required by
coloc, and contained in the column SNP.PP.H4 in the “results” element of the returned list.) Extract the more likely causal variants by
candidates=subset(my.res.ch16_long$results,SNP.PP.H4>0.01)
#candidates
Or the 95% credible set by
o <- order(my.res.ch16_long$results$SNP.PP.H4,decreasing=TRUE)
cs <- cumsum(my.res.ch16_long$results$SNP.PP.H4[o])
w <- which(cs > 0.95)[1]
credible_set=my.res.ch16_long$results[o,][1:w,]$snp
#credible_set
Save at my Rpubs
Giambartolomei, Claudia, Damjan Vukcevic, Eric E. Schadt, Lude Franke, Aroon D. Hingorani, Chris Wallace, and Vincent Plagnol. 2014. “Bayesian Test for Colocalisation Between Pairs of Genetic Association Studies Using Summary Statistics.” PLoS Genet 10 (5). https://doi.org/10.1371/journal.pgen.1004383.