Bash

I have a separate bash script: MR-mqtldb-eqtlGEN.sh that I’m tracking. I have not yet decided what I prefer:

  • integrating bash into .Rmd, or
  • having it separate as its own tracked file

Below are just boiler plate (helpful commands)

# Helpful commands

# Getting numer of lines
wc -l nucs_total_maf_beta_se.txt

# Listing by file type
ls *.Rmd | head -n 5
ls *.csv | head -n 5
ls *_mr.csv | head -n 5

# Removing directories with protected files 
rm -f -R /n/home04/cdadams/MR-eqtlGen-long #rm a directory with a protected file

# Viewing hidden files
ls -dl .[^.]* ~

# Removing git to re-initialize
rm -rf .git
rm README.md 

# Moving
mv *_mr.csv /n/home04/cdadams/MR-eqtlGen-bc
mv *_clump3.csv /n/home04/cdadams/MR-eqtlGen-bc

Format the mqtldb (DNA methylation in blood) data

setwd('/n/home04/cdadams/MR-mqtldb-eqtlGEN')

# Read-in the RP/NUC set
combo4=read.csv("/n/home04/cdadams/ld/ldsc/combo4.csv")

# Read-in the mqtldb data from MR-Base
exposure_dat_aries <- format_aries_mqtl(subset(aries_mqtl, gene %in% combo4$gene & cis_trans=="cis"))
write.csv(exposure_dat_aries, 'mqtldb.csv')

Format eqtlGen GWAS

  • eqtlGen stored MAFs in a separate file.
  • MAFs are needed for the calculation of the betas & SEs from the Z-scores.
  • Methods for this are provided by: Zhu 2016: https://www.nature.com/articles/ng.3538

\[ \beta = \frac{z} {\sqrt2p(1-p)(n + z^2)} \] \[ SE = \frac{1} {\sqrt2p(1-p)(n + z^2)} \] where \[p = MAF\]

# README files for the eQTLGen GWAS
# This README accompanies the files with cis-eQTL results from eQTLGen
 
# Files
# -----
# File with full cis-eQTL results: 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz
# File with significant (FDR<0.05) cis-eQTL results: 2019-12-11-cis-eQTLsFDR0.05-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz
 
# Column Names
# ------------
# Pvalue - P-value
# SNP - SNP rs ID
# SNPChr - SNP chromosome
# SNPPos - SNP position
# AssessedAllele - Assessed allele, the Z-score refers to this allele
# OtherAllele - Not assessed allele
# Zscore - Z-score
# Gene - ENSG name (Ensembl v71) of the eQTL gene
# GeneSymbol - HGNC name of the gene
# GeneChr - Gene chromosome
# GenePos - Centre of gene position
# NrCohorts - Total number of cohorts where this SNP-gene combination was tested
# NrSamples - Total number of samples where this SNP-gene combination was tested
# FDR - False discovery rate estimated based on permutations
# BonferroniP - P-value after Bonferroni correction
# 
# Additional information
# ----------------------
# These files contain all cis-eQTL results from eQTLGen, accompanying the article.
# 19,250 genes that showed expression in blood were tested.
# Every SNP-gene combination with a distance <1Mb from the center of the gene and  tested in at least 2 cohorts was included.
# Associations where SNP/proxy positioned in Illumina probe were not removed from combined analysis.
# UPDATE LOG
# ----------
# 2018-10-19: Initial data release
# 2018-12-19: In the current README, following file names have been fixed and updated:
# 2019-12-20: Cis-eQTLs are now updated to have a 2-cohort filter: every cis-eQTL must be tested in at least 2 cohorts to be reported

# On cluster, I gzipped the eqtlGen file: gzip 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt
# this is because I kept getting error messages both here & on the cluster saying ther was no room on the disk.
# I moved unnecessary files to a folder in my personal computer DB: /Users/charleenadams/Dropbox/Harvard/longevity/survival/odyssey_transfer/
# This has the gz files for the protein GWAS, the eQTLGen SMR file, and the eQTLGen EAF file.
# I launched an interacted terminal session with increased memory: 
# salloc -p test  --mem 5000 -t 0-06:00

#-----------------------------------------------------------------------------------------------------------------------
# README file for the MAF eQTLGen cis-eQTL data 

# A README accompanies the file with allele frequencies from eQTLGen cis-eQTL analysis: 
# 2018-07-18_SNP_AF_for_AlleleB_combined_allele_counts_and_MAF_pos_added.txt.gz

# Column Names
# ------------
# SNP -         SNP rs ID
# hg19_chr -    chr number
# hg19_pos -    SNP position (hg19)
# AlleleA -     Other allele
# AlleleB -     Assessed allele
# allA_total -  Total allele count of genotype AA
# allAB_total - Total allele count of genotype AB
# allB_total -  Total allele count of genotype BB
# AlleleB_all - Allele frequency of assessed allele

# Additional information
# ----------------------
# The allele frequencies were calculated using reported allele counts from all cohorts except 
# Framingham Heart Study, because the related samples are present in this cohort.

#-----------------------------------------------------------------------------------------------------------------------
setwd('/n/home04/cdadams/MR-eqtlGen-long')
combo4=read.csv('/n/home04/cdadams/MR-eqtlGen-long/combo4.csv')

tbl=fread('cat 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz | gunzip')

# Extract NUCs
nucs=tbl[which(tbl$GeneSymbol %in% combo4$gene),]

# Read-in MAFs
eqtl_MAF=read.table(file = "2018-07-18_SNP_AF_for_AlleleB_combined_allele_counts_and_MAF_pos_added.txt", 
                    sep = "\t", header=TRUE)
nucs_total <- merge(nucs,eqtl_MAF,by="SNP")

# Calculate betas and SEs
n=30596
nucs_total$Beta=nucs_total$Zscore/sqrt(2*(nucs_total$AlleleB_all)*
                  (1-nucs_total$AlleleB_all)*(n+nucs_total$Zscore^2))
nucs_total$SE=1/sqrt(2*(nucs_total$AlleleB_all)*
                  (1-nucs_total$AlleleB_all)*(n+nucs_total$Zscore^2))

write.table(nucs_total, file = "/n/home04/cdadams/MR-eqtlGen-long/nucs_total_maf_beta_se.txt", sep = "\t",
            row.names = FALSE, col.names = TRUE, quote=FALSE)

# gznucs <- gzfile("/n/home04/cdadams/MR-eqtlGen-long/nucs.gz", "w")
# write.csv(nucs, gznucs)
# close(gznucs)

Harmonize

# Read-in both data sources for MR formatting

# Read-in the saved NUC mQTLs from mqtldb 
exposure_dat_aries=read.csv("mqtldb.csv")

# Read-in the eqtlGen NUC eqtls
outcome_nuc_eqtlGen <- read_outcome_data(
  snps = exposure_dat_aries$SNP,
  filename = "/n/home04/cdadams/MR-eqtlGen-long/nucs_total_maf_beta_se.txt",
  sep = '\t',
  snp_col = 'SNP',
  beta_col = 'Beta',
  se_col = 'SE',
  effect_allele_col = 'AssessedAllele',
  phenotype_col = 'GeneSymbol',
  #units_col = 'tissue',
  other_allele_col = 'OtherAllele',
  eaf_col = 'AlleleB_all',
  #samplesize_col = 'samplesize',
  #ncase_col = 'ncase',
  #ncontrol_col = 'ncontrol',
  gene_col = 'GeneSymbol',
  pval_col = 'Pvalue', 
  id_col = 'GeneSymbol'
)

dat <- harmonise_data(exposure_dat_aries, outcome_nuc_eqtlGen, action = 2)
write.csv(dat, "harmonized_dat.csv")

# Select just the NUC mQTLs that correspond to NUC eQTLs
same=dat[which(dat$gene.exposure==dat$outcome),]
write.csv(same, "same.csv")

Results

# MR
# Read-in the harmonized data for mQTLs of NUC genes
same=read.csv("/n/home04/cdadams/MR-mqtldb-eqtlGEN/same.csv")
dat=read.csv("/n/home04/cdadams/MR-mqtldb-eqtlGEN/harmonized_dat.csv")

singlesnp_results=mr_singlesnp(same, parameters = default_parameters(),
   single_method = "mr_wald_ratio")

# Order
singlesnp_results=singlesnp_results[order(singlesnp_results$SNP),]

# Remove the meta-analytic estimaotrs
singlesnp_results=singlesnp_results[!grepl("All - Inverse variance weighted", singlesnp_results$SNP),]
singlesnp_results=singlesnp_results[!grepl("All - MR Egger", singlesnp_results$SNP),]
singlesnp_results=singlesnp_results[!grepl("No available data", singlesnp_results$SNP),]
singlesnp_results$samplesize <- 30596

#snp_res<- na.omit(singlesnp_results)
write.csv(singlesnp_results, "singlesnp_results.csv")

# Merge the data and results
dat_res_meth_exp=merge(dat, singlesnp_results, by=c("SNP", "exposure", "outcome"), all = TRUE)
dat_res_meth_exp<-subset(dat_res_meth_exp, (!is.na(dat_res_meth_exp$p)))

# FDR correction
p2=dat_res_meth_exp$p
fdr2=p.adjust(p2, method = "fdr", n = length(p2))
dat_res_meth_exp$fdr=fdr2

# Are the directions of methylation and expression in accordance with the hypothesis that 
# increased methylation decreases expression and decreased methylation increases expression?

dat_res_meth_exp$concord1=ifelse(dat_res_meth_exp$beta.exposure<0 & dat_res_meth_exp$beta.outcome>0, "concord", "no")
dat_res_meth_exp$concord2=ifelse(dat_res_meth_exp$beta.exposure>0 & dat_res_meth_exp$beta.outcome<0, "concord", dat_res_meth_exp$concord1)
table(dat_res_meth_exp$concord2)
## 
## concord      no 
##     698     507
# Fisher's test of count data
dat_res_meth_exp_fish <- matrix(c(698, 1205-698, 507, 1205-507), nrow = 2)
fish=fisher.test(dat_res_meth_exp_fish )
fish
## 
##  Fisher's Exact Test for Count Data
## 
## data:  dat_res_meth_exp_fish
## p-value = 8.729e-15
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.606926 2.235467
## sample estimates:
## odds ratio 
##   1.894879
# Proportion's test
res_meth_exp <- prop.test(x = c(698, 507), n = c(1205, 1205))
res_meth_exp
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(698, 507) out of c(1205, 1205)
## X-squared = 59.917, df = 1, p-value = 9.894e-15
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.1182566 0.1987559
## sample estimates:
##    prop 1    prop 2 
## 0.5792531 0.4207469
# Top findings
myvars=c("SNP", "exposure", "outcome","beta.exposure","beta.outcome", "b", "se", "p", "fdr", "concord2")
res=dat_res_meth_exp[myvars]
head(res)
##          SNP                 exposure  outcome beta.exposure beta.outcome
## 1 rs10023020 cg19671926 (Adolescence)   EXOSC9       -0.3943  -0.20709820
## 2 rs10098159       cg11401558 (Birth)    RPL30        0.3452  -0.04601619
## 3  rs1010627 cg13379971 (Adolescence) EBNA1BP2        0.5905   0.26932713
## 4 rs10113292   cg26371346 (Childhood)    RPS20        0.2507  -0.05084873
## 5  rs1013950  cg08333555 (Middle age)     ERI2       -0.4513   0.06230175
## 7  rs1017137  cg02413092 (Middle age)     NOM1       -0.3734  -0.08630208
##            b         se             p           fdr concord2
## 1  0.5252300 0.02104380 1.709485e-137 8.995327e-137       no
## 2 -0.1333030 0.02345397  1.318885e-08  1.869703e-08  concord
## 3  0.4561001 0.01547981 8.282096e-191 5.225092e-190       no
## 4 -0.2028270 0.04133001  9.224466e-07  1.254569e-06  concord
## 5 -0.1380495 0.01818834  3.199055e-14  5.814270e-14  concord
## 7  0.2311250 0.02487756  1.535652e-20  3.190450e-20       no
write.csv(dat_res_meth_exp, 'dat_res_meth_exp_all.csv')