Changes in the levels of DNA methylation of the RPs may impact their expression.
setwd('/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN')
# Read-in the RP set
#combo4=read.csv("/n/home04/cdadams/ld/ldsc/combo4.csv")
source("/n/home04/cdadams/RP_lists/RP_lists.R")
# Read-in the mqtldb data from MR-Base
exposure_dat_aries <- format_aries_mqtl(subset(aries_mqtl, gene %in% rp_list_expanded$gene & cis_trans=="cis"))
write.csv(exposure_dat_aries, '/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/mqtldb.csv')
\[ \beta = \frac{z} {\sqrt2p(1-p)(n + z^2)} \] \[ SE = \frac{1} {\sqrt2p(1-p)(n + z^2)} \] where \[p = MAF\]
# README files for the eQTLGen GWAS
# This README accompanies the files with cis-eQTL results from eQTLGen
# Files
# -----
# File with full cis-eQTL results: 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz
# File with significant (FDR<0.05) cis-eQTL results: 2019-12-11-cis-eQTLsFDR0.05-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz
# Column Names
# ------------
# Pvalue - P-value
# SNP - SNP rs ID
# SNPChr - SNP chromosome
# SNPPos - SNP position
# AssessedAllele - Assessed allele, the Z-score refers to this allele
# OtherAllele - Not assessed allele
# Zscore - Z-score
# Gene - ENSG name (Ensembl v71) of the eQTL gene
# GeneSymbol - HGNC name of the gene
# GeneChr - Gene chromosome
# GenePos - Centre of gene position
# NrCohorts - Total number of cohorts where this SNP-gene combination was tested
# NrSamples - Total number of samples where this SNP-gene combination was tested
# FDR - False discovery rate estimated based on permutations
# BonferroniP - P-value after Bonferroni correction
#
# Additional information
# ----------------------
# These files contain all cis-eQTL results from eQTLGen, accompanying the article.
# 19,250 genes that showed expression in blood were tested.
# Every SNP-gene combination with a distance <1Mb from the center of the gene and tested in at least 2 cohorts was included.
# Associations where SNP/proxy positioned in Illumina probe were not removed from combined analysis.
# UPDATE LOG
# ----------
# 2018-10-19: Initial data release
# 2018-12-19: In the current README, following file names have been fixed and updated:
# 2019-12-20: Cis-eQTLs are now updated to have a 2-cohort filter: every cis-eQTL must be tested in at least 2 cohorts to be reported
# On cluster, I gzipped the eqtlGen file: gzip 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt
# this is because I kept getting error messages both here & on the cluster saying ther was no room on the disk.
# I moved unnecessary files to a folder in my personal computer DB: /Users/charleenadams/Dropbox/Harvard/longevity/survival/odyssey_transfer/
# This has the gz files for the protein GWAS, the eQTLGen SMR file, and the eQTLGen EAF file.
# I launched an interacted terminal session with increased memory:
# salloc -p test --mem 5000 -t 0-06:00
#-----------------------------------------------------------------------------------------------------------------------
# README file for the MAF eQTLGen cis-eQTL data
# A README accompanies the file with allele frequencies from eQTLGen cis-eQTL analysis:
# 2018-07-18_SNP_AF_for_AlleleB_combined_allele_counts_and_MAF_pos_added.txt.gz
# Column Names
# ------------
# SNP - SNP rs ID
# hg19_chr - chr number
# hg19_pos - SNP position (hg19)
# AlleleA - Other allele
# AlleleB - Assessed allele
# allA_total - Total allele count of genotype AA
# allAB_total - Total allele count of genotype AB
# allB_total - Total allele count of genotype BB
# AlleleB_all - Allele frequency of assessed allele
# Additional information
# ----------------------
# The allele frequencies were calculated using reported allele counts from all cohorts except
# Framingham Heart Study, because the related samples are present in this cohort.
#-----------------------------------------------------------------------------------------------------------------------
#setwd('/n/home04/cdadams/MR-eqtlGen-long')
#combo4=read.csv('/n/home04/cdadams/MR-eqtlGen-long/combo4.csv')
source("/n/home04/cdadams/RP_lists/RP_lists.R")
tbl=fread('cat 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz | gunzip')
# Extract RPs
RPs=tbl[which(tbl$GeneSymbol %in% rp_list_expanded$gene),]
# Read-in MAFs
eqtl_MAF=read.table(file = "/n/home04/cdadams/MR-eqtlGen-long/2018-07-18_SNP_AF_for_AlleleB_combined_allele_counts_and_MAF_pos_added.txt",
sep = "\t", header=TRUE)
RPs_total <- merge(RPs,eqtl_MAF,by="SNP")
# Calculate betas and SEs
n=30596
RPs_total$Beta=RPs_total$Zscore/sqrt(2*(RPs_total$AlleleB_all)*
(1-RPs_total$AlleleB_all)*(n+RPs_total$Zscore^2))
RPs_total$SE=1/sqrt(2*(RPs_total$AlleleB_all)*
(1-RPs_total$AlleleB_all)*(n+RPs_total$Zscore^2))
write.table(RPs_total, file = "/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/RPs_total_maf_beta_se.txt", sep = "\t",
row.names = FALSE, col.names = TRUE, quote=FALSE)
# gzRPs <- gzfile("/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/RPs.gz", "w")
# write.csv(RPs, gzRPs)
# close(gzRPs)
# Read-in both data sources for MR formatting
# Read-in the saved RP mQTLs from mqtldb
exposure_dat_aries=read.csv("/n/home04/cdadams/MR-mqtldb-RP-eqtlGENmqtldb.csv")
# Read-in the eqtlGen RP eqtls
outcome_RP_eqtlGen <- read_outcome_data(
snps = exposure_dat_aries$SNP,
filename = "/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/RPs_total_maf_beta_se.txt",
sep = '\t',
snp_col = 'SNP',
beta_col = 'Beta',
se_col = 'SE',
effect_allele_col = 'AssessedAllele',
phenotype_col = 'GeneSymbol',
#units_col = 'tissue',
other_allele_col = 'OtherAllele',
eaf_col = 'AlleleB_all',
#samplesize_col = 'samplesize',
#ncase_col = 'ncase',
#ncontrol_col = 'ncontrol',
gene_col = 'GeneSymbol',
pval_col = 'Pvalue',
id_col = 'GeneSymbol'
)
dat <- harmonise_data(exposure_dat_aries, outcome_RP_eqtlGen, action = 2)
write.csv(dat, "/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/harmonized_dat.csv")
# Select just the RP mQTLs that correspond to RP eQTLs
same=dat[which(dat$gene.exposure==dat$outcome),]
write.csv(same, "/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/same.csv")
# Descriptives
# Read-in the RP/RP set
source("/n/home04/cdadams/RP_lists/RP_lists.R")
# Read-in the mqtldb data from MR-Base
exposure_dat_aries <- read.csv('/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/mqtldb.csv')
# Create the timepoints var
# Remove all characters before the first space
exposure_dat_aries$times=sub("^\\S+\\s+", '', exposure_dat_aries$exposure)
# Remove all ()
exposure_dat_aries$times=str_remove_all(exposure_dat_aries$times, "[()]")
tab_times=table(exposure_dat_aries$exposure)
tab_times=as.data.frame(tab_times)
tab_times$Times=tab_times$Var1
tab_times$Var1=NULL
#exposure_dat_aries$gene.exposure=droplevels(exposure_dat_aries$gene.exposure)
tab_gene=table(exposure_dat_aries$gene.exposure)
tab_gene=as.data.frame(tab_gene)
tab_gene$Gene=tab_gene$Var1
tab_gene$Var1=NULL
exposure_dat_aries$cpg <- gsub(" .*$", "", exposure_dat_aries$exposure)
tab_cpg=table(exposure_dat_aries$cpg)
tab_cpg=as.data.frame(tab_cpg)
tab_cpg$CpG=tab_cpg$Var1
tab_cpg$Var1=NULL
tab_SNP=table(exposure_dat_aries$SNP)
tab_SNP=as.data.frame(tab_SNP)
tab_SNP$SNP=tab_SNP$Var1
tab_SNP$Var1=NULL
summary1 <-
list("SNPs" =
list("SNP" = tab_SNP),
"CpG" =
list("CpG" = tab_cpg),
"Time points" =
list("Time points" = tab_times),
"RP genes" =
list("RP genes" = tab_gene)
)
# Get descriptives
table(exposure_dat_aries$times)
##
## Adolescence Birth Childhood Middle age Pregnancy
## 101 92 96 94 100
dim(tab_gene)
## [1] 57 2
dim(tab_cpg)
## [1] 90 2
dim(tab_SNP)
## [1] 276 2
dim(tab_times)
## [1] 424 2
# Descriptives
same=read.csv("/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/same.csv")
tab_gene2=table(same$gene.exposure)
tab_gene2=as.data.frame(tab_gene2)
tab_gene2$Gene=tab_gene2$Var1
tab_gene2$Var1=NULL
same$cpg <- gsub(" .*$", "", same$exposure)
tab_cpg2=table(same$cpg)
tab_cpg2=as.data.frame(tab_cpg2)
tab_cpg2$CpG=tab_cpg2$Var1
tab_cpg2$Var1=NULL
tab_SNP2=table(same$SNP)
tab_SNP2=as.data.frame(tab_SNP2)
tab_SNP2$SNP=tab_SNP2$Var1
tab_SNP2$Var1=NULL
tab_times2=table(same$exposure)
tab_times2=as.data.frame(tab_times2)
tab_times2$Times=tab_times2$Var1
tab_times2$Var1=NULL
summary2 <-
list("SNPs" =
list("SNP" = tab_SNP2),
"CpG" =
list("CpG" = tab_cpg2),
"Time points" =
list("Time points" = tab_times2),
"RP genes" =
list("RP genes" = tab_gene2)
)
# Ideas for reformatting the summary tables
# test=append(summary1,summary2)
# test3=do.call(c, list(summary1,summary2))
# combineLists <- function(manyLists){
# library(plyr)
# newLists <- list()
# for(ixList in 1:length(manyLists)){
# tmpList <- lapply(manyLists[[ixList]], paste, sep = "", collapse = ", ")
# tmpVec <- as.character(tmpList)
# newLists[[ixList]] <- tmpVec
# }
# newDF <- t(ldply(newLists))
# return(newDF)
# }
#
# test4=combineLists(list(summary1,summary2))
# this_df1=bind_rows(summary2)
# this_df2 <- Reduce(function(x, y) merge(x, y, all = TRUE), summary2)
# Create the timepoints var
dat=read.csv("/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/harmonized_dat.csv")
# Remove all characters before the first space
dat$times=sub("^\\S+\\s+", '', dat$exposure)
# Remove all ()
dat$times=str_remove_all(dat$times, "[()]")
write.csv(dat, "/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/harmonized_dat.csv")
# Get descriptives
table(dat$times)
##
## Adolescence Birth Childhood Middle age Pregnancy
## 111 103 106 93 104
dim(tab_gene2)
## [1] 56 2
dim(tab_cpg2)
## [1] 87 2
dim(tab_SNP2)
## [1] 242 2
dim(tab_times2)
## [1] 373 2
# MR
# Read-in the harmonized data for mQTLs of RP genes
same=read.csv("/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/same.csv")
#dat=read.csv("/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/harmonized_dat.csv")
singlesnp_results=mr_singlesnp(same, parameters = default_parameters(),
single_method = "mr_wald_ratio")
# Order
singlesnp_results=singlesnp_results[order(singlesnp_results$SNP),]
# Remove the meta-analytic estimaotrs
singlesnp_results=singlesnp_results[!grepl("All - Inverse variance weighted", singlesnp_results$SNP),]
singlesnp_results=singlesnp_results[!grepl("All - MR Egger", singlesnp_results$SNP),]
singlesnp_results=singlesnp_results[!grepl("No available data", singlesnp_results$SNP),]
singlesnp_results$samplesize <- 30596
#snp_res<- na.omit(singlesnp_results)
write.csv(singlesnp_results, "/n/home04/cdadams/MR-mqtldb-RP-eqtlGEN/singlesnp_results.csv")
# Merge the data and results
dat_res_meth_exp=merge(same, singlesnp_results, by=c("SNP", "exposure", "outcome"), all = TRUE)
dat_res_meth_exp<-subset(dat_res_meth_exp, (!is.na(dat_res_meth_exp$p)))
# FDR correction
p2=dat_res_meth_exp$p
fdr2=p.adjust(p2, method = "fdr", n = length(p2))
dat_res_meth_exp$fdr=fdr2
# Are the directions of methylation and expression in accordance with the hypothesis that
# increased methylation decreases expression and decreased methylation increases expression?
dat_res_meth_exp$concord1=ifelse(dat_res_meth_exp$beta.exposure<0 & dat_res_meth_exp$beta.outcome>0, "concord", "no")
dat_res_meth_exp$concord2=ifelse(dat_res_meth_exp$beta.exposure>0 & dat_res_meth_exp$beta.outcome<0, "concord", dat_res_meth_exp$concord1)
table(dat_res_meth_exp$concord2)
##
## concord no
## 221 169
# Fisher's test of count data
dat_res_meth_exp_fish <- matrix(c(221, 390-221, 169, 390-169), nrow = 2)
fish=fisher.test(dat_res_meth_exp_fish )
fish
##
## Fisher's Exact Test for Count Data
##
## data: dat_res_meth_exp_fish
## p-value = 0.0002552
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.275378 2.293141
## sample estimates:
## odds ratio
## 1.708899
# Proportion's test
res_meth_exp <- prop.test(x = c(221, 169), n = c(390, 390))
res_meth_exp
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(221, 169) out of c(390, 390)
## X-squared = 13.338, df = 1, p-value = 0.00026
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.06121786 0.20544880
## sample estimates:
## prop 1 prop 2
## 0.5666667 0.4333333
# Top findings
myvars=c("SNP", "exposure", "outcome","beta.exposure","beta.outcome", "b", "se", "p", "fdr", "concord2")
res=dat_res_meth_exp[myvars]
res=res[order(res$fdr),]
# Round just the numerical variables
round_res <- function(res, digits) {
nums <- vapply(res, is.numeric, FUN.VALUE = logical(1))
res[,nums] <- round(res[,nums], digits = digits)
(res)
}
res=round_res(res, digits=3)
datatable(res, options = list(pageLength = 5))