Format eqtlGen GWAS

  • eqtlGen stored MAFs in a separate file.
  • MAFs are needed for the calculation of the betas & SEs from the Z-scores.
  • Methods for this are provided by: Zhu 2016: https://www.nature.com/articles/ng.3538

\[ \beta = \frac{z} {\sqrt2p(1-p)(n + z^2)} \] \[ SE = \frac{1} {\sqrt2p(1-p)(n + z^2)} \] where \[p = MAF\]

# README files for the eQTLGen GWAS
# This README accompanies the files with cis-eQTL results from eQTLGen
 
# Files
# -----
# File with full cis-eQTL results: 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz
# File with significant (FDR<0.05) cis-eQTL results: 2019-12-11-cis-eQTLsFDR0.05-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz
 
# Column Names
# ------------
# Pvalue - P-value
# SNP - SNP rs ID
# SNPChr - SNP chromosome
# SNPPos - SNP position
# AssessedAllele - Assessed allele, the Z-score refers to this allele
# OtherAllele - Not assessed allele
# Zscore - Z-score
# Gene - ENSG name (Ensembl v71) of the eQTL gene
# GeneSymbol - HGNC name of the gene
# GeneChr - Gene chromosome
# GenePos - Centre of gene position
# NrCohorts - Total number of cohorts where this SNP-gene combination was tested
# NrSamples - Total number of samples where this SNP-gene combination was tested
# FDR - False discovery rate estimated based on permutations
# BonferroniP - P-value after Bonferroni correction
# 
# Additional information
# ----------------------
# These files contain all cis-eQTL results from eQTLGen, accompanying the article.
# 19,250 genes that showed expression in blood were tested.
# Every SNP-gene combination with a distance <1Mb from the center of the gene and  tested in at least 2 cohorts was included.
# Associations where SNP/proxy positioned in Illumina probe were not removed from combined analysis.
# UPDATE LOG
# ----------
# 2018-10-19: Initial data release
# 2018-12-19: In the current README, following file names have been fixed and updated:
# 2019-12-20: Cis-eQTLs are now updated to have a 2-cohort filter: every cis-eQTL must be tested in at least 2 cohorts to be reported

# On cluster, I gzipped the eqtlGen file: gzip 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt
# this is because I kept getting error messages both here & on the cluster saying ther was no room on the disk.
# I moved unnecessary files to a folder in my personal computer DB: /Users/charleenadams/Dropbox/Harvard/bc/survival/odyssey_transfer/
# This has the gz files for the protein GWAS, the eQTLGen SMR file, and the eQTLGen EAF file.
# I launched an interacted terminal session with increased memory: 
# salloc -p test  --mem 5000 -t 0-06:00

#-----------------------------------------------------------------------------------------------------------------------
# README file for the MAF eQTLGen cis-eQTL data 

# A README accompanies the file with allele frequencies from eQTLGen cis-eQTL analysis: 
# 2018-07-18_SNP_AF_for_AlleleB_combined_allele_counts_and_MAF_pos_added.txt.gz

# Column Names
# ------------
# SNP -         SNP rs ID
# hg19_chr -    chr number
# hg19_pos -    SNP position (hg19)
# AlleleA -     Other allele
# AlleleB -     Assessed allele
# allA_total -  Total allele count of genotype AA
# allAB_total - Total allele count of genotype AB
# allB_total -  Total allele count of genotype BB
# AlleleB_all - Allele frequency of assessed allele

# Additional information
# ----------------------
# The allele frequencies were calculated using reported allele counts from all cohorts except 
# Framingham Heart Study, because the related samples are present in this cohort.

#-----------------------------------------------------------------------------------------------------------------------
# Remember that the eqtlGen file is stored in /n/home04/cdadams/MR-eqtlGen-long
# Don't move to MR-eqtlGen-bc because it will result a storage over-capacity

setwd('/n/home04/cdadams/MR-eqtlGen-long/')
combo4=read.csv('/n/home04/cdadams/MR-eqtlGen-bc/combo4.csv')

tbl=fread('cat 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz | gunzip')

# Extract NUCs
nucs=tbl[which(tbl$GeneSymbol %in% combo4$gene),]
head(tbl$Gene)

# Read-in MAFs
eqtl_MAF=read.table(file = "2018-07-18_SNP_AF_for_AlleleB_combined_allele_counts_and_MAF_pos_added.txt", 
                    sep = "\t", header=TRUE)
nucs_total <- merge(nucs,eqtl_MAF,by="SNP")

# Calculate betas and SEs
n=30596
nucs_total$Beta=nucs_total$Zscore/sqrt(2*(nucs_total$AlleleB_all)*
                  (1-nucs_total$AlleleB_all)*(n+nucs_total$Zscore^2))
nucs_total$SE=1/sqrt(2*(nucs_total$AlleleB_all)*
                  (1-nucs_total$AlleleB_all)*(n+nucs_total$Zscore^2))

write.table(nucs_total, file = "/n/home04/cdadams/MR-eqtlGen-long/nucs_total_maf_beta_se.txt", sep = "\t",
            row.names = FALSE, col.names = TRUE, quote=FALSE)

# gznucs <- gzfile("/n/home04/cdadams/MR-eqtlGen-bc/nucs.gz", "w")
# write.csv(nucs, gznucs)
# close(gznucs)

# Read-in for MR formatting
exposure_nuc_eqtlGen <- read_exposure_data(
  filename = "/n/home04/cdadams/MR-eqtlGen-long/nucs_total_maf_beta_se.txt",
  sep = '\t',
  snp_col = 'SNP',
  beta_col = 'Beta',
  se_col = 'SE',
  effect_allele_col = 'AssessedAllele',
  #phenotype_col = 'tissue',
  #units_col = 'tissue',
  other_allele_col = 'OtherAllele',
  eaf_col = 'AlleleB_all',
  #samplesize_col = 'samplesize',
  #ncase_col = 'ncase',
  #ncontrol_col = 'ncontrol',
  gene_col = 'GeneSymbol',
  pval_col = 'Pvalue', 
  id_col = 'NUC eqtlGEN'
)

exposure_nuc_eqtlGen_p=exposure_nuc_eqtlGen[which(exposure_nuc_eqtlGen$pval.exposure<0.000005),]
write.csv(exposure_nuc_eqtlGen_p, 'exposure_nuc_eqtlGen_p.csv')

Clean the BCAC data

I’ve kept the BCAC data in the ldsr folder since copying the GWAS files to each new folder for each new project will result in over-reaching the storage capacity I’m allotted on Odyssey.

#####################################################################################################
# Read-in the BCAC data
# I had compressed the file before transfering to the cluster: 
# gzip icogs_onco_gwas_meta_overall_breast_cancer_summary_level_statistics.txt
# An Excel data dictionary accompanies the BCAC data
# /n/home04/cdadams/ld/ldsc//data_dictionary_summary_stats_bc_risk_2020.xlsx

#Data dictiony
# iCOGS-Oncoarray-GWAS-Meta-Overall-summary-results 
# Label Meaning
# var_name  The variant name is coded as chromosome_position_non-effect-allele_effect-allele
# Effect.Gwas   The effect allele in GWAS
# Baseline.Gwas The non effect allele in GWAS
# Freq.Gwas Effect allele frequency
# beta.Gwas Log odds ratio (OR) for effect alllele in GWAS data
# SE.Gwas   standard error of the log OR estimate in GWAS data
# P.value.Gwas  P-value of the log OR estimate in GWAS data
# SNP.iCOGS SNPs name in iCOGS panel. For SNPs with rsid, the name is coded as rsid:position:non-effect-allele:effect-allele; 
# For SNPs without rsid, the name is similarly as var_name 

bcac=read.table(gzfile("/n/home04/cdadams/ld/ldsc/icogs_onco_gwas_meta_overall_breast_cancer_summary_level_statistics.txt.gz"), header=TRUE)
colnames(bcac)
bcac$id=gsub( ":.*$", "", bcac$SNP.iCOGs)
bcac2=bcac[which(startsWith(bcac$id, 'rs')),]   
colnames(bcac2)[5]="Beta"

#save.image("longevity_bcac.Rdata")
write.table(bcac2, file = "/n/home04/cdadams/ld/ldsc/bcac_master3.txt", sep = "\t",
            row.names = FALSE, col.names = TRUE, quote=FALSE)
# Read-in the bcac file formated for ldsc
bcac3=read.table(file = "/n/home04/cdadams/ld/ldsc/bcac_master3.txt", sep = "\t", header=TRUE)

Harmonize the SNPs

exposure_nuc_eqtlGen_p=read.csv('/n/home04/cdadams/MR-eqtlGen-long/exposure_nuc_eqtlGen_p.csv')

bc_outcome <- read_outcome_data(
  snps = exposure_nuc_eqtlGen_p$SNP,
  filename = "/n/home04/cdadams/ld/ldsc/bcac_master3.txt",
  sep = '\t',
  snp_col = 'id',
  beta_col = 'Beta',
  se_col = 'SE.Gwas',
  effect_allele_col = 'Effect.Gwas',
  #phenotype_col = 'tissue',#tissue
  #units_col = 'tissue',
  other_allele_col = 'Baseline.Gwas',
  eaf_col = 'Freq.Gwas',
  #samplesize_col = 'samplesize',
  #ncase_col = '133384',
  #ncontrol_col = '113789',
  #gene_col = 'gene_name',
  pval_col = 'P.value.Gwas', 
  id_col = 'breast cancer'
)

#Harmonize the alleles btw the two gwas
nuc_eqtlGen_bc <- harmonise_data(exposure_nuc_eqtlGen_p, bc_outcome, action = 2)
nuc_eqtlGen_bc$samplesize.outcome="247173"
nuc_eqtlGen_bc$samplesize.exposure="30596"
nuc_eqtlGen_bc$id.exposure=nuc_eqtlGen_bc$gene.exposure

write.csv(nuc_eqtlGen_bc, '/n/home04/cdadams/MR-eqtlGen-bc/nuc_eqtlGen_bc.csv') #can simply read this in for the MR

Split, loop, MR, & FDR-correct results

setwd('/n/home04/cdadams/MR-eqtlGen-bc')
nuc_eqtlGen_bc=read.csv('/n/home04/cdadams/MR-eqtlGen-bc/nuc_eqtlGen_bc.csv')

# Command to remove everything except "v"
#rm(list=(ls()[ls()!="v"]))

# Split the harmonized data into separate data frames for each genes
nuc_split <- split(nuc_eqtlGen_bc, nuc_eqtlGen_bc$id.exposure)
list2env(nuc_split, envir= .GlobalEnv) #split the list into separate datasets

# Remove the original nuc_eqtlGen_bc and nuc_split
rm(list= ls()[(ls() %in% c("nuc_eqtlGen_bc","nuc_split") )])

# Save each harmonized gene as a .csv
# for (i in 1:length(files)){
#   write.csv(files[[i]], paste(names(files[i]), ".csv", sep = ""))
# }

# Get them back together: not used here but useful
# dfs = sapply(.GlobalEnv, is.data.frame) 
# dfs
# 
# df=do.call(rbind, mget(names(dfs)[dfs]))
# head(df)
# dim(df)

# Combine the separated files into a list
files <- mget(ls())
length(files)

# Use a loop to clump each NUC dataframe in the list, run MR, & save results
for (i in 1:length(files)){
   files[[i]]=clump_data(files[[i]])
   write.csv(files[[i]], paste(names(files[i]), "_clump3.csv", sep = ""))
   files[[i]]=mr(files[[i]])
   write.csv(files[[i]], paste(names(files[i]), "_mr.csv", sep = ""))
}

# Idea for reading all mr results in and saving:
# Idea from: https://statisticsglobe.com/merge-csv-files-in-r
# Sadly, didn't work. 
# data_all <- list.files(path = "/n/home04/cdadams/MR-eqtlGen-bc",  
#   pattern = "*_mr.csv", full.names = TRUE) %>% 
#   lapply(read_csv) %>%                                                    
#   bind_rows                                                                
#   write.csv(data_all, "complete_NUC_MR_res.csv")

# Merged all of the clumped data 
tbl_fread_clump <- 
    list.files(pattern = "*_clump3.csv") %>% 
    map_df(~fread(.))
tbl_fread_clump_df=as.data.frame(tbl_fread_clump)
head(tbl_fread_clump_df)
colnames(tbl_fread_clump_df)
df <- tbl_fread_clump_df[c(3:34)] #check this
write.csv(df, '/n/home04/cdadams/MR-eqtlGen-bc/df.csv')

# Merge all the MR results and save 
tbl_fread <- 
    list.files(pattern = "*_mr.csv") %>% 
    map_df(~fread(.))

tbl_fread_df=as.data.frame(tbl_fread)
myvars=c('id.exposure','id.outcome','method', 'nsnp','b','se','pval')
res=tbl_fread_df[myvars]  
res$id.outcome="bc"
res=res[order(res$p),]

# FDR correction
p=res$p
fdr=p.adjust(p, method = "fdr", n = length(p))
res$fdr=fdr
res=res[order(res$fdr),]
write.csv(res,'/n/home04/cdadams/MR-eqtlGen-bc/MR-eqtlGEN-NUC-bc.csv')
res=res[order(res$p),]

# Select just the set with >2 SNPs
res=res[order(res$fdr),]
res_meta_set=res[which(res$nsnp>2),]
res_meta_set=res_meta_set[which(res_meta_set$method=='Inverse variance weighted'),]

# FDR for res_meta_set (only IVWs)
p2=res_meta_set$p
fdr2=p.adjust(p2, method = "fdr", n = length(p2))
res_meta_set$fdr2=fdr2
res_meta_set$fdr=NULL
res_meta_set=res_meta_set[order(res_meta_set$fdr2),]

write.csv(res_meta_set,'/n/home04/cdadams/MR-eqtlGen-bc/MR-eqtlGEN-NUC-bc-meta.csv')

# Consideration for later: Find way to merge the df and results

Results

Top results ordered by p-value (not FDR)

This includes a list of all tests run

  • includes 1 SNP tests (Wald ratios)
  • inverse variance weighted (IVW) – main meta-analytic test
  • sensitivity estimators: weighted median, weighted mode, and MR-Egger
res=read.csv('/n/home04/cdadams/MR-eqtlGen-bc/MR-eqtlGEN-NUC-bc.csv')
head(res, n=5)
##     X id.exposure id.outcome                    method nsnp          b
## 1 360      MRPS30         bc                Wald ratio    1  0.9559052
## 2  44        DAP3         bc                Wald ratio    1 -0.2849066
## 3 247     MPV17L2         bc Inverse variance weighted    2  0.3108872
## 4 628       RPP30         bc                Wald ratio    1 -0.7646673
## 5 747       UTP15         bc                Wald ratio    1  0.3270660
##          se        pval       fdr
## 1 0.2494272 0.000126898 0.1022798
## 2 0.0931892 0.002233468 0.9000878
## 3 0.1164038 0.007567856 0.9849350
## 4 0.2897187 0.008306658 0.9849350
## 5 0.1249164 0.008837437 0.9849350

Top IVW results by FDR

res_meta_set=read.csv('/n/home04/cdadams/MR-eqtlGen-bc/MR-eqtlGEN-NUC-bc-meta.csv')
head(res_meta_set, n=5)
##     X id.exposure id.outcome                    method nsnp          b
## 1 489        PELO         bc Inverse variance weighted    4 -0.1426427
## 2 571      RPL13A         bc Inverse variance weighted    4 -0.1652277
## 3 172       GTF3A         bc Inverse variance weighted    5  0.1345609
## 4 459       NSUN4         bc Inverse variance weighted    5  0.1106715
## 5 167      GTF2H5         bc Inverse variance weighted    6  0.1550646
##           se       pval      fdr2
## 1 0.05614872 0.01107119 0.5093042
## 2 0.06542380 0.01155344 0.5093042
## 3 0.05835549 0.02111721 0.5093042
## 4 0.04836557 0.02212418 0.5093042
## 5 0.06849377 0.02357890 0.5093042

Wald ratios (all unclumped SNPs)

setwd('/n/home04/cdadams/MR-eqtlGen-bc')
nuc_eqtlGen_bc=read.csv('/n/home04/cdadams/MR-eqtlGen-bc/nuc_eqtlGen_bc.csv')

# Split the harmonized data into separate data frames for each genes
nuc_split <- split(nuc_eqtlGen_bc, nuc_eqtlGen_bc$id.exposure)
list2env(nuc_split, envir= .GlobalEnv) #split the list into separate datasets

# Remove the original nuc_eqtlGen_bc and nuc_split
rm(list= ls()[(ls() %in% c("nuc_eqtlGen_bc","nuc_split") )])

# Combine the separated files into a list
files <- mget(ls())
length(files)

# Use a loop to run MR & save results

for (i in 1:length(files)){
   files[[i]]=mr_singlesnp(files[[i]], parameters = default_parameters(), single_method = "mr_wald_ratio")
   write.csv(files[[i]], paste(names(files[i]), "_Walds_mr.csv", sep = ""))
}

# Merge all the MR results and save 
Walds_res_tbl_fread <- 
    list.files(pattern = "*_Walds_mr.csv") %>% 
    map_df(~fread(.))

Walds_res_tbl_fread_df=as.data.frame(Walds_res_tbl_fread)
myvars=c('id.exposure','id.outcome', 'SNP','b','se','p')
Walds_res=Walds_res_tbl_fread_df[myvars]  
Walds_res$id.outcome="bc"
Walds_res=Walds_res[order(Walds_res$p),]
write.csv(Walds_res,"Walds_includes_metas_res.csv")

# Tried  selecting "rs", but the "inverse variance weighted" includes "rs"
exp1 <- Walds_res[Walds_res$SNP %like% "rs", ]  

Walds_res2=Walds_res %>% 
  filter(str_detect(Walds_res$SNP, "^rs"))

# FDR correction
p=Walds_res2$p
fdr=p.adjust(p, method = "fdr", n = length(p))
Walds_res2$fdr=fdr
Walds_res2=Walds_res2[order(Walds_res2$fdr),]
write.csv(Walds_res2,'/n/home04/cdadams/MR-eqtlGen-bc/Walds_MR_eqtlGEN_NUC_bc.csv')

Top Wald-ratio results by FDR

Walds_res=read.csv('/n/home04/cdadams/MR-eqtlGen-bc/Walds_MR_eqtlGEN_NUC_bc.csv')
head(Walds_res, n=5)
##   X id.exposure id.outcome        SNP         b        se            p
## 1 1      MRPL21         bc rs17136641 -3.687379 0.4660907 2.547347e-15
## 2 2      MRPS30         bc rs10941679  1.797656 0.3311905 5.703353e-08
## 3 3      MRPL23         bc  rs2271439 -1.914985 0.3971946 1.426424e-06
## 4 4      MRPL23         bc rs11041615 -1.890211 0.3952851 1.736521e-06
## 5 5      MRPS30         bc  rs6451770  1.285949 0.2708041 2.047953e-06
##            fdr
## 1 4.324172e-10
## 2 4.840777e-03
## 3 6.952882e-02
## 4 6.952882e-02
## 5 6.952882e-02

Output (eQTLGen on BC)

My Rpubs