I have a separate bash script: MR-mqtldb-eqtlGEN.sh that Iām tracking. I have not yet decided what I prefer:
Below are just boiler plate (helpful commands)
# Helpful commands
# Getting numer of lines
wc -l nucs_total_maf_beta_se.txt
# Listing by file type
ls *.Rmd | head -n 5
ls *.csv | head -n 5
ls *_mr.csv | head -n 5
# Removing directories with protected files
rm -f -R /n/home04/cdadams/MR-eqtlGen-long #rm a directory with a protected file
# Viewing hidden files
ls -dl .[^.]* ~
# Removing git to re-initialize
rm -rf .git
rm README.md
# Moving
mv *_mr.csv /n/home04/cdadams/MR-eqtlGen-bc
mv *_clump3.csv /n/home04/cdadams/MR-eqtlGen-bc
setwd('/n/home04/cdadams/MR-mqtldb-eqtlGEN')
# Read-in the RP/NUC set
combo4=read.csv("/n/home04/cdadams/ld/ldsc/combo4.csv")
# Read-in the mqtldb data from MR-Base
exposure_dat_aries <- format_aries_mqtl(subset(aries_mqtl, gene %in% combo4$gene & cis_trans=="cis"))
write.csv(exposure_dat_aries, 'mqtldb.csv')
\[ \beta = \frac{z} {\sqrt2p(1-p)(n + z^2)} \] \[ SE = \frac{1} {\sqrt2p(1-p)(n + z^2)} \] where \[p = MAF\]
# README files for the eQTLGen GWAS
# This README accompanies the files with cis-eQTL results from eQTLGen
# Files
# -----
# File with full cis-eQTL results: 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz
# File with significant (FDR<0.05) cis-eQTL results: 2019-12-11-cis-eQTLsFDR0.05-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz
# Column Names
# ------------
# Pvalue - P-value
# SNP - SNP rs ID
# SNPChr - SNP chromosome
# SNPPos - SNP position
# AssessedAllele - Assessed allele, the Z-score refers to this allele
# OtherAllele - Not assessed allele
# Zscore - Z-score
# Gene - ENSG name (Ensembl v71) of the eQTL gene
# GeneSymbol - HGNC name of the gene
# GeneChr - Gene chromosome
# GenePos - Centre of gene position
# NrCohorts - Total number of cohorts where this SNP-gene combination was tested
# NrSamples - Total number of samples where this SNP-gene combination was tested
# FDR - False discovery rate estimated based on permutations
# BonferroniP - P-value after Bonferroni correction
#
# Additional information
# ----------------------
# These files contain all cis-eQTL results from eQTLGen, accompanying the article.
# 19,250 genes that showed expression in blood were tested.
# Every SNP-gene combination with a distance <1Mb from the center of the gene and tested in at least 2 cohorts was included.
# Associations where SNP/proxy positioned in Illumina probe were not removed from combined analysis.
# UPDATE LOG
# ----------
# 2018-10-19: Initial data release
# 2018-12-19: In the current README, following file names have been fixed and updated:
# 2019-12-20: Cis-eQTLs are now updated to have a 2-cohort filter: every cis-eQTL must be tested in at least 2 cohorts to be reported
# On cluster, I gzipped the eqtlGen file: gzip 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt
# this is because I kept getting error messages both here & on the cluster saying ther was no room on the disk.
# I moved unnecessary files to a folder in my personal computer DB: /Users/charleenadams/Dropbox/Harvard/longevity/survival/odyssey_transfer/
# This has the gz files for the protein GWAS, the eQTLGen SMR file, and the eQTLGen EAF file.
# I launched an interacted terminal session with increased memory:
# salloc -p test --mem 5000 -t 0-06:00
#-----------------------------------------------------------------------------------------------------------------------
# README file for the MAF eQTLGen cis-eQTL data
# A README accompanies the file with allele frequencies from eQTLGen cis-eQTL analysis:
# 2018-07-18_SNP_AF_for_AlleleB_combined_allele_counts_and_MAF_pos_added.txt.gz
# Column Names
# ------------
# SNP - SNP rs ID
# hg19_chr - chr number
# hg19_pos - SNP position (hg19)
# AlleleA - Other allele
# AlleleB - Assessed allele
# allA_total - Total allele count of genotype AA
# allAB_total - Total allele count of genotype AB
# allB_total - Total allele count of genotype BB
# AlleleB_all - Allele frequency of assessed allele
# Additional information
# ----------------------
# The allele frequencies were calculated using reported allele counts from all cohorts except
# Framingham Heart Study, because the related samples are present in this cohort.
#-----------------------------------------------------------------------------------------------------------------------
setwd('/n/home04/cdadams/MR-eqtlGen-long')
combo4=read.csv('/n/home04/cdadams/MR-eqtlGen-long/combo4.csv')
tbl=fread('cat 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz | gunzip')
# Extract NUCs
nucs=tbl[which(tbl$GeneSymbol %in% combo4$gene),]
# Read-in MAFs
eqtl_MAF=read.table(file = "2018-07-18_SNP_AF_for_AlleleB_combined_allele_counts_and_MAF_pos_added.txt",
sep = "\t", header=TRUE)
nucs_total <- merge(nucs,eqtl_MAF,by="SNP")
# Calculate betas and SEs
n=30596
nucs_total$Beta=nucs_total$Zscore/sqrt(2*(nucs_total$AlleleB_all)*
(1-nucs_total$AlleleB_all)*(n+nucs_total$Zscore^2))
nucs_total$SE=1/sqrt(2*(nucs_total$AlleleB_all)*
(1-nucs_total$AlleleB_all)*(n+nucs_total$Zscore^2))
write.table(nucs_total, file = "/n/home04/cdadams/MR-eqtlGen-long/nucs_total_maf_beta_se.txt", sep = "\t",
row.names = FALSE, col.names = TRUE, quote=FALSE)
# gznucs <- gzfile("/n/home04/cdadams/MR-eqtlGen-long/nucs.gz", "w")
# write.csv(nucs, gznucs)
# close(gznucs)
# Read-in both data sources for MR formatting
# Read-in the saved NUC mQTLs from mqtldb
exposure_dat_aries=read.csv("mqtldb.csv")
# Read-in the eqtlGen NUC eqtls
outcome_nuc_eqtlGen <- read_outcome_data(
snps = exposure_dat_aries$SNP,
filename = "/n/home04/cdadams/MR-eqtlGen-long/nucs_total_maf_beta_se.txt",
sep = '\t',
snp_col = 'SNP',
beta_col = 'Beta',
se_col = 'SE',
effect_allele_col = 'AssessedAllele',
phenotype_col = 'GeneSymbol',
#units_col = 'tissue',
other_allele_col = 'OtherAllele',
eaf_col = 'AlleleB_all',
#samplesize_col = 'samplesize',
#ncase_col = 'ncase',
#ncontrol_col = 'ncontrol',
gene_col = 'GeneSymbol',
pval_col = 'Pvalue',
id_col = 'GeneSymbol'
)
dat <- harmonise_data(exposure_dat_aries, outcome_nuc_eqtlGen, action = 2)
write.csv(dat, "harmonized_dat.csv")
# Select just the NUC mQTLs that correspond to NUC eQTLs
same=dat[which(dat$gene.exposure==dat$outcome),]
write.csv(same, "same.csv")
# MR
# Read-in the harmonized data for mQTLs of NUC genes
same=read.csv("/n/home04/cdadams/MR-mqtldb-eqtlGEN/same.csv")
dat=read.csv("/n/home04/cdadams/MR-mqtldb-eqtlGEN/harmonized_dat.csv")
singlesnp_results=mr_singlesnp(same, parameters = default_parameters(),
single_method = "mr_wald_ratio")
# Order
singlesnp_results=singlesnp_results[order(singlesnp_results$SNP),]
# Remove the meta-analytic estimaotrs
singlesnp_results=singlesnp_results[!grepl("All - Inverse variance weighted", singlesnp_results$SNP),]
singlesnp_results=singlesnp_results[!grepl("All - MR Egger", singlesnp_results$SNP),]
singlesnp_results=singlesnp_results[!grepl("No available data", singlesnp_results$SNP),]
singlesnp_results$samplesize <- 30596
#snp_res<- na.omit(singlesnp_results)
write.csv(singlesnp_results, "singlesnp_results.csv")
# Merge the data and results
dat_res_meth_exp=merge(dat, singlesnp_results, by=c("SNP", "exposure", "outcome"), all = TRUE)
dat_res_meth_exp<-subset(dat_res_meth_exp, (!is.na(dat_res_meth_exp$p)))
# FDR correction
p2=dat_res_meth_exp$p
fdr2=p.adjust(p2, method = "fdr", n = length(p2))
dat_res_meth_exp$fdr=fdr2
# Are the directions of methylation and expression in accordance with the hypothesis that
# increased methylation decreases expression and decreased methylation increases expression?
dat_res_meth_exp$concord1=ifelse(dat_res_meth_exp$beta.exposure<0 & dat_res_meth_exp$beta.outcome>0, "concord", "no")
dat_res_meth_exp$concord2=ifelse(dat_res_meth_exp$beta.exposure>0 & dat_res_meth_exp$beta.outcome<0, "concord", dat_res_meth_exp$concord1)
table(dat_res_meth_exp$concord2)
##
## concord no
## 698 507
# Fisher's test of count data
dat_res_meth_exp_fish <- matrix(c(698, 1205-698, 507, 1205-507), nrow = 2)
fish=fisher.test(dat_res_meth_exp_fish )
fish
##
## Fisher's Exact Test for Count Data
##
## data: dat_res_meth_exp_fish
## p-value = 8.729e-15
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.606926 2.235467
## sample estimates:
## odds ratio
## 1.894879
# Proportion's test
res_meth_exp <- prop.test(x = c(698, 507), n = c(1205, 1205))
res_meth_exp
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(698, 507) out of c(1205, 1205)
## X-squared = 59.917, df = 1, p-value = 9.894e-15
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.1182566 0.1987559
## sample estimates:
## prop 1 prop 2
## 0.5792531 0.4207469
# Top findings
myvars=c("SNP", "exposure", "outcome","beta.exposure","beta.outcome", "b", "se", "p", "fdr", "concord2")
res=dat_res_meth_exp[myvars]
head(res)
## SNP exposure outcome beta.exposure beta.outcome
## 1 rs10023020 cg19671926 (Adolescence) EXOSC9 -0.3943 -0.20709820
## 2 rs10098159 cg11401558 (Birth) RPL30 0.3452 -0.04601619
## 3 rs1010627 cg13379971 (Adolescence) EBNA1BP2 0.5905 0.26932713
## 4 rs10113292 cg26371346 (Childhood) RPS20 0.2507 -0.05084873
## 5 rs1013950 cg08333555 (Middle age) ERI2 -0.4513 0.06230175
## 7 rs1017137 cg02413092 (Middle age) NOM1 -0.3734 -0.08630208
## b se p fdr concord2
## 1 0.5252300 0.02104380 1.709485e-137 8.995327e-137 no
## 2 -0.1333030 0.02345397 1.318885e-08 1.869703e-08 concord
## 3 0.4561001 0.01547981 8.282096e-191 5.225092e-190 no
## 4 -0.2028270 0.04133001 9.224466e-07 1.254569e-06 concord
## 5 -0.1380495 0.01818834 3.199055e-14 5.814270e-14 concord
## 7 0.2311250 0.02487756 1.535652e-20 3.190450e-20 no
write.csv(dat_res_meth_exp, 'dat_res_meth_exp_all.csv')