Contents

0.1 Format mqtldb

setwd('/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN')

# Read-in the RP set
#combo4=read.csv("/n/home04/cdadams/ld/ldsc/combo4.csv")
source("/n/home04/cdadams/RP_lists/RP_lists.R")

# Read-in the mqtldb data from MR-Base
exposure_dat_aries <- format_aries_mqtl(subset(aries_mqtl, gene %in% rp_list_expanded$gene & cis_trans=="cis"))
write.csv(exposure_dat_aries, '/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/mqtldb.csv')

0.2 Format eqtlGen

\[ \beta = \frac{z} {\sqrt2p(1-p)(n + z^2)} \] \[ SE = \frac{1} {\sqrt2p(1-p)(n + z^2)} \] where \[p = MAF\]

# README files for the eQTLGen GWAS
# This README accompanies the files with cis-eQTL results from eQTLGen
 
# Files
# -----
# File with full cis-eQTL results: 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz
# File with significant (FDR<0.05) cis-eQTL results: 2019-12-11-cis-eQTLsFDR0.05-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz
 
# Column Names
# ------------
# Pvalue - P-value
# SNP - SNP rs ID
# SNPChr - SNP chromosome
# SNPPos - SNP position
# AssessedAllele - Assessed allele, the Z-score refers to this allele
# OtherAllele - Not assessed allele
# Zscore - Z-score
# Gene - ENSG name (Ensembl v71) of the eQTL gene
# GeneSymbol - HGNC name of the gene
# GeneChr - Gene chromosome
# GenePos - Centre of gene position
# NrCohorts - Total number of cohorts where this SNP-gene combination was tested
# NrSamples - Total number of samples where this SNP-gene combination was tested
# FDR - False discovery rate estimated based on permutations
# BonferroniP - P-value after Bonferroni correction
# 
# Additional information
# ----------------------
# These files contain all cis-eQTL results from eQTLGen, accompanying the article.
# 19,250 genes that showed expression in blood were tested.
# Every SNP-gene combination with a distance <1Mb from the center of the gene and  tested in at least 2 cohorts was included.
# Associations where SNP/proxy positioned in Illumina probe were not removed from combined analysis.
# UPDATE LOG
# ----------
# 2018-10-19: Initial data release
# 2018-12-19: In the current README, following file names have been fixed and updated:
# 2019-12-20: Cis-eQTLs are now updated to have a 2-cohort filter: every cis-eQTL must be tested in at least 2 cohorts to be reported

# On cluster, I gzipped the eqtlGen file: gzip 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt
# this is because I kept getting error messages both here & on the cluster saying ther was no room on the disk.
# I moved unnecessary files to a folder in my personal computer DB: /Users/charleenadams/Dropbox/Harvard/longevity/survival/odyssey_transfer/
# This has the gz files for the protein GWAS, the eQTLGen SMR file, and the eQTLGen EAF file.
# I launched an interacted terminal session with increased memory: 
# salloc -p test  --mem 5000 -t 0-06:00

#-----------------------------------------------------------------------------------------------------------------------
# README file for the MAF eQTLGen cis-eQTL data 

# A README accompanies the file with allele frequencies from eQTLGen cis-eQTL analysis: 
# 2018-07-18_SNP_AF_for_AlleleB_combined_allele_counts_and_MAF_pos_added.txt.gz

# Column Names
# ------------
# SNP -         SNP rs ID
# hg19_chr -    chr number
# hg19_pos -    SNP position (hg19)
# AlleleA -     Other allele
# AlleleB -     Assessed allele
# allA_total -  Total allele count of genotype AA
# allAB_total - Total allele count of genotype AB
# allB_total -  Total allele count of genotype BB
# AlleleB_all - Allele frequency of assessed allele

# Additional information
# ----------------------
# The allele frequencies were calculated using reported allele counts from all cohorts except 
# Framingham Heart Study, because the related samples are present in this cohort.

#-----------------------------------------------------------------------------------------------------------------------
#setwd('/n/home04/cdadams/MR-eqtlGen-long')
#combo4=read.csv('/n/home04/cdadams/MR-eqtlGen-long/combo4.csv')
source("/n/home04/cdadams/RP_lists/RP_lists.R")

tbl=fread('cat 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz | gunzip')

# Extract RPs
RPs=tbl[which(tbl$GeneSymbol %in% rp_list_expanded$gene),]

# Read-in MAFs
eqtl_MAF=read.table(file = "/n/home04/cdadams/MR-eqtlGen-long/2018-07-18_SNP_AF_for_AlleleB_combined_allele_counts_and_MAF_pos_added.txt", 
                    sep = "\t", header=TRUE)
RPs_total <- merge(RPs,eqtl_MAF,by="SNP")

# Calculate betas and SEs
n=30596
RPs_total$Beta=RPs_total$Zscore/sqrt(2*(RPs_total$AlleleB_all)*
                  (1-RPs_total$AlleleB_all)*(n+RPs_total$Zscore^2))
RPs_total$SE=1/sqrt(2*(RPs_total$AlleleB_all)*
                  (1-RPs_total$AlleleB_all)*(n+RPs_total$Zscore^2))

write.table(RPs_total, file = "/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/RPs_total_maf_beta_se.txt", sep = "\t",
            row.names = FALSE, col.names = TRUE, quote=FALSE)

# gzRPs <- gzfile("/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/RPs.gz", "w")
# write.csv(RPs, gzRPs)
# close(gzRPs)

0.3 Harmonize

# Read-in both data sources for MR formatting

# Read-in the saved RP mQTLs from mqtldb 
exposure_dat_aries=read.csv("/n/home04/cdadams/MR-mqtldb-RP-eqtlGENmqtldb.csv")

# Read-in the eqtlGen RP eqtls
outcome_RP_eqtlGen <- read_outcome_data(
  snps = exposure_dat_aries$SNP,
  filename = "/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/RPs_total_maf_beta_se.txt",
  sep = '\t',
  snp_col = 'SNP',
  beta_col = 'Beta',
  se_col = 'SE',
  effect_allele_col = 'AssessedAllele',
  phenotype_col = 'GeneSymbol',
  #units_col = 'tissue',
  other_allele_col = 'OtherAllele',
  eaf_col = 'AlleleB_all',
  #samplesize_col = 'samplesize',
  #ncase_col = 'ncase',
  #ncontrol_col = 'ncontrol',
  gene_col = 'GeneSymbol',
  pval_col = 'Pvalue', 
  id_col = 'GeneSymbol'
)

dat <- harmonise_data(exposure_dat_aries, outcome_RP_eqtlGen, action = 2)
write.csv(dat, "/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/harmonized_dat.csv")

# Select just the RP mQTLs that correspond to RP eQTLs
same=dat[which(dat$gene.exposure==dat$outcome),]
write.csv(same, "/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/same.csv")

0.4 Descriptive: mqtldb

# Descriptives 

# Read-in the RP/RP set
source("/n/home04/cdadams/RP_lists/RP_lists.R")

# Read-in the mqtldb data from MR-Base
exposure_dat_aries <- read.csv('/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/mqtldb.csv')

# Create the timepoints var
# Remove all characters before the first space
exposure_dat_aries$times=sub("^\\S+\\s+", '', exposure_dat_aries$exposure)

# Remove all ()
exposure_dat_aries$times=str_remove_all(exposure_dat_aries$times, "[()]")

tab_times=table(exposure_dat_aries$exposure)
tab_times=as.data.frame(tab_times)
tab_times$Times=tab_times$Var1
tab_times$Var1=NULL

#exposure_dat_aries$gene.exposure=droplevels(exposure_dat_aries$gene.exposure)
tab_gene=table(exposure_dat_aries$gene.exposure)
tab_gene=as.data.frame(tab_gene)
tab_gene$Gene=tab_gene$Var1
tab_gene$Var1=NULL

exposure_dat_aries$cpg <- gsub(" .*$", "", exposure_dat_aries$exposure)
tab_cpg=table(exposure_dat_aries$cpg)
tab_cpg=as.data.frame(tab_cpg)
tab_cpg$CpG=tab_cpg$Var1
tab_cpg$Var1=NULL

tab_SNP=table(exposure_dat_aries$SNP)
tab_SNP=as.data.frame(tab_SNP)
tab_SNP$SNP=tab_SNP$Var1
tab_SNP$Var1=NULL

summary1 <-
  list("SNPs" =
       list("SNP" = tab_SNP),
       "CpG" =
       list("CpG" = tab_cpg),
       "Time points" =
       list("Time points" = tab_times),
       "RP genes" =
       list("RP genes" = tab_gene)
       )
# Get descriptives
table(exposure_dat_aries$times)
## 
## Adolescence       Birth   Childhood  Middle age   Pregnancy 
##         101          92          96          94         100
dim(tab_gene)
## [1] 57  2
dim(tab_cpg)
## [1] 90  2
dim(tab_SNP)
## [1] 276   2
dim(tab_times)
## [1] 424   2

0.5 Descriptives: merged mqtldb & eqtlGen

# Descriptives 
same=read.csv("/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/same.csv")
tab_gene2=table(same$gene.exposure)
tab_gene2=as.data.frame(tab_gene2)
tab_gene2$Gene=tab_gene2$Var1
tab_gene2$Var1=NULL

same$cpg <- gsub(" .*$", "", same$exposure)
tab_cpg2=table(same$cpg)
tab_cpg2=as.data.frame(tab_cpg2)
tab_cpg2$CpG=tab_cpg2$Var1
tab_cpg2$Var1=NULL

tab_SNP2=table(same$SNP)
tab_SNP2=as.data.frame(tab_SNP2)
tab_SNP2$SNP=tab_SNP2$Var1
tab_SNP2$Var1=NULL

tab_times2=table(same$exposure)
tab_times2=as.data.frame(tab_times2)
tab_times2$Times=tab_times2$Var1
tab_times2$Var1=NULL

summary2 <-
  list("SNPs" =
       list("SNP" = tab_SNP2),
       "CpG" =
       list("CpG" = tab_cpg2),
       "Time points" =
       list("Time points" = tab_times2),
       "RP genes" =
       list("RP genes" = tab_gene2)
       )

# Ideas for reformatting the summary tables
# test=append(summary1,summary2)
# test3=do.call(c, list(summary1,summary2))
# combineLists <- function(manyLists){
#     library(plyr)
#     newLists <- list()
#     for(ixList in 1:length(manyLists)){
#         tmpList <- lapply(manyLists[[ixList]], paste, sep = "", collapse = ", ")
#         tmpVec  <- as.character(tmpList)
#         newLists[[ixList]] <- tmpVec
#     }
#     newDF   <- t(ldply(newLists))
#     return(newDF)
# }
# 
# test4=combineLists(list(summary1,summary2))
# this_df1=bind_rows(summary2)
# this_df2 <- Reduce(function(x, y) merge(x, y, all = TRUE), summary2)

# Create the timepoints var
dat=read.csv("/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/harmonized_dat.csv")

# Remove all characters before the first space
dat$times=sub("^\\S+\\s+", '', dat$exposure)

# Remove all ()
dat$times=str_remove_all(dat$times, "[()]")
write.csv(dat, "/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/harmonized_dat.csv")

# Get descriptives
table(dat$times)
## 
## Adolescence       Birth   Childhood  Middle age   Pregnancy 
##         111         103         106          93         104
dim(tab_gene2)
## [1] 56  2
dim(tab_cpg2)
## [1] 87  2
dim(tab_SNP2)
## [1] 242   2
dim(tab_times2)
## [1] 373   2

0.6 Results

# MR
# Read-in the harmonized data for mQTLs of RP genes
same=read.csv("/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/same.csv")
#dat=read.csv("/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/harmonized_dat.csv")

singlesnp_results=mr_singlesnp(same, parameters = default_parameters(),
   single_method = "mr_wald_ratio")

# Order
singlesnp_results=singlesnp_results[order(singlesnp_results$SNP),]

# Remove the meta-analytic estimaotrs
singlesnp_results=singlesnp_results[!grepl("All - Inverse variance weighted", singlesnp_results$SNP),]
singlesnp_results=singlesnp_results[!grepl("All - MR Egger", singlesnp_results$SNP),]
singlesnp_results=singlesnp_results[!grepl("No available data", singlesnp_results$SNP),]
singlesnp_results$samplesize <- 30596

#snp_res<- na.omit(singlesnp_results)
write.csv(singlesnp_results, "/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/singlesnp_results.csv")

# Merge the data and results
dat_res_meth_exp=merge(same, singlesnp_results, by=c("SNP", "exposure", "outcome"), all = TRUE)
dat_res_meth_exp<-subset(dat_res_meth_exp, (!is.na(dat_res_meth_exp$p)))

# FDR correction
p2=dat_res_meth_exp$p
fdr2=p.adjust(p2, method = "fdr", n = length(p2))
dat_res_meth_exp$fdr=fdr2

# Are the directions of methylation and expression in accordance with the hypothesis that 
# increased methylation decreases expression and decreased methylation increases expression?

dat_res_meth_exp$concord1=ifelse(dat_res_meth_exp$beta.exposure<0 & dat_res_meth_exp$beta.outcome>0, "concord", "no")
dat_res_meth_exp$concord2=ifelse(dat_res_meth_exp$beta.exposure>0 & dat_res_meth_exp$beta.outcome<0, "concord", dat_res_meth_exp$concord1)
table(dat_res_meth_exp$concord2)
## 
## concord      no 
##     221     169
# Fisher's test of count data
dat_res_meth_exp_fish <- matrix(c(221, 390-221, 169, 390-169), nrow = 2)
fish=fisher.test(dat_res_meth_exp_fish )
fish
## 
##  Fisher's Exact Test for Count Data
## 
## data:  dat_res_meth_exp_fish
## p-value = 0.0002552
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.275378 2.293141
## sample estimates:
## odds ratio 
##   1.708899
# Proportion's test
res_meth_exp <- prop.test(x = c(221, 169), n = c(390, 390))
res_meth_exp
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(221, 169) out of c(390, 390)
## X-squared = 13.338, df = 1, p-value = 0.00026
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.06121786 0.20544880
## sample estimates:
##    prop 1    prop 2 
## 0.5666667 0.4333333
# Top findings
myvars=c("SNP", "exposure", "outcome","beta.exposure","beta.outcome", "b", "se", "p", "fdr", "concord2")
res=dat_res_meth_exp[myvars]
res=res[order(res$fdr),]

# Round just the numerical variables
round_res <- function(res, digits) {
  nums <- vapply(res, is.numeric, FUN.VALUE = logical(1))

  res[,nums] <- round(res[,nums], digits = digits)

  (res)
}

res=round_res(res, digits=3)
datatable(res, options = list(pageLength = 5))

0.7 Rpubs

Link: https://rpubs.com/Charleen_D_Adams/752947