Format eqtlGen GWAS

  • eqtlGen stored MAFs in a separate file.
  • MAFs are needed for the calculation of the betas & SEs from the Z-scores.
  • Methods for this are provided by: Zhu 2016: https://www.nature.com/articles/ng.3538

\[ \beta = \frac{z} {\sqrt2p(1-p)(n + z^2)} \] \[ SE = \frac{1} {\sqrt2p(1-p)(n + z^2)} \] where \[p = MAF\]

# README files for the eQTLGen GWAS
# This README accompanies the files with cis-eQTL results from eQTLGen
 
# Files
# -----
# File with full cis-eQTL results: 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz
# File with significant (FDR<0.05) cis-eQTL results: 2019-12-11-cis-eQTLsFDR0.05-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz
 
# Column Names
# ------------
# Pvalue - P-value
# SNP - SNP rs ID
# SNPChr - SNP chromosome
# SNPPos - SNP position
# AssessedAllele - Assessed allele, the Z-score refers to this allele
# OtherAllele - Not assessed allele
# Zscore - Z-score
# Gene - ENSG name (Ensembl v71) of the eQTL gene
# GeneSymbol - HGNC name of the gene
# GeneChr - Gene chromosome
# GenePos - Centre of gene position
# NrCohorts - Total number of cohorts where this SNP-gene combination was tested
# NrSamples - Total number of samples where this SNP-gene combination was tested
# FDR - False discovery rate estimated based on permutations
# BonferroniP - P-value after Bonferroni correction
# 
# Additional information
# ----------------------
# These files contain all cis-eQTL results from eQTLGen, accompanying the article.
# 19,250 genes that showed expression in blood were tested.
# Every SNP-gene combination with a distance <1Mb from the center of the gene and  tested in at least 2 cohorts was included.
# Associations where SNP/proxy positioned in Illumina probe were not removed from combined analysis.
# UPDATE LOG
# ----------
# 2018-10-19: Initial data release
# 2018-12-19: In the current README, following file names have been fixed and updated:
# 2019-12-20: Cis-eQTLs are now updated to have a 2-cohort filter: every cis-eQTL must be tested in at least 2 cohorts to be reported

# On cluster, I gzipped the eqtlGen file: gzip 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt
# this is because I kept getting error messages both here & on the cluster saying ther was no room on the disk.
# I moved unnecessary files to a folder in my personal computer DB: /Users/charleenadams/Dropbox/Harvard/bc/survival/odyssey_transfer/
# This has the gz files for the protein GWAS, the eQTLGen SMR file, and the eQTLGen EAF file.
# I launched an interacted terminal session with increased memory: 
# salloc -p test  --mem 5000 -t 0-06:00

#-----------------------------------------------------------------------------------------------------------------------
# README file for the MAF eQTLGen cis-eQTL data 

# A README accompanies the file with allele frequencies from eQTLGen cis-eQTL analysis: 
# 2018-07-18_SNP_AF_for_AlleleB_combined_allele_counts_and_MAF_pos_added.txt.gz

# Column Names
# ------------
# SNP -         SNP rs ID
# hg19_chr -    chr number
# hg19_pos -    SNP position (hg19)
# AlleleA -     Other allele
# AlleleB -     Assessed allele
# allA_total -  Total allele count of genotype AA
# allAB_total - Total allele count of genotype AB
# allB_total -  Total allele count of genotype BB
# AlleleB_all - Allele frequency of assessed allele

# Additional information
# ----------------------
# The allele frequencies were calculated using reported allele counts from all cohorts except 
# Framingham Heart Study, because the related samples are present in this cohort.

#-----------------------------------------------------------------------------------------------------------------------
# Remember that the eqtlGen file is stored in /n/home04/cdadams/MR-eqtlGen-long
# Don't move to MR-eqtlGen-bc because it will result a storage over-capacity

setwd('/n/home04/cdadams/MR-eqtlGen-long/')
combo4=read.csv('/n/home04/cdadams/MR-eqtlGen-bc/combo4.csv')

tbl=fread('cat 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz | gunzip')

# Extract NUCs
nucs=tbl[which(tbl$GeneSymbol %in% combo4$gene),]
head(tbl$Gene)

# Read-in MAFs
eqtl_MAF=read.table(file = "2018-07-18_SNP_AF_for_AlleleB_combined_allele_counts_and_MAF_pos_added.txt", 
                    sep = "\t", header=TRUE)
nucs_total <- merge(nucs,eqtl_MAF,by="SNP")

# Calculate betas and SEs
n=30596
nucs_total$Beta=nucs_total$Zscore/sqrt(2*(nucs_total$AlleleB_all)*
                  (1-nucs_total$AlleleB_all)*(n+nucs_total$Zscore^2))
nucs_total$SE=1/sqrt(2*(nucs_total$AlleleB_all)*
                  (1-nucs_total$AlleleB_all)*(n+nucs_total$Zscore^2))

write.table(nucs_total, file = "/n/home04/cdadams/MR-eqtlGen-long/nucs_total_maf_beta_se.txt", sep = "\t",
            row.names = FALSE, col.names = TRUE, quote=FALSE)

# gznucs <- gzfile("/n/home04/cdadams/MR-eqtlGen-bc/nucs.gz", "w")
# write.csv(nucs, gznucs)
# close(gznucs)

# Read-in for MR formatting
exposure_nuc_eqtlGen <- read_exposure_data(
  filename = "/n/home04/cdadams/MR-eqtlGen-long/nucs_total_maf_beta_se.txt",
  sep = '\t',
  snp_col = 'SNP',
  beta_col = 'Beta',
  se_col = 'SE',
  effect_allele_col = 'AssessedAllele',
  #phenotype_col = 'tissue',
  units_col = 'SNPChr',
  other_allele_col = 'OtherAllele',
  eaf_col = 'AlleleB_all',
  #chr='SNPChr',
  #position='SNPPos',
  samplesize_col = 'SNPPos',
  #ncase_col = 'ncase',
  #ncontrol_col = 'ncontrol',
  gene_col = 'GeneSymbol',
  pval_col = 'Pvalue', 
  id_col = 'NUC eqtlGEN'
)

# Chromosome and position are needed for the circos plot
# I couldn't use the commands for chr and position (they appear defunct), so I had to use the command for units for chr and samplesize for position. I re-saved it with _pos_chr at the end

exposure_nuc_eqtlGen_p=exposure_nuc_eqtlGen[which(exposure_nuc_eqtlGen$pval.exposure<0.000005),]
write.csv(exposure_nuc_eqtlGen_p, '/n/home04/cdadams/Circos-eqtlGen-bc/exposure_nuc_eqtlGen_p_pos_chr.csv')

Clean the BCAC data

I’ve kept the BCAC data in the ldsr folder since copying the GWAS files to each new folder for each new project will result in over-reaching the storage capacity I’m allotted on Odyssey.

#####################################################################################################
# Read-in the BCAC data
# I had compressed the file before transfering to the cluster: 
# gzip icogs_onco_gwas_meta_overall_breast_cancer_summary_level_statistics.txt
# An Excel data dictionary accompanies the BCAC data
# /n/home04/cdadams/ld/ldsc//data_dictionary_summary_stats_bc_risk_2020.xlsx

#Data dictiony
# iCOGS-Oncoarray-GWAS-Meta-Overall-summary-results 
# Label Meaning
# var_name  The variant name is coded as chromosome_position_non-effect-allele_effect-allele
# Effect.Gwas   The effect allele in GWAS
# Baseline.Gwas The non effect allele in GWAS
# Freq.Gwas Effect allele frequency
# beta.Gwas Log odds ratio (OR) for effect alllele in GWAS data
# SE.Gwas   standard error of the log OR estimate in GWAS data
# P.value.Gwas  P-value of the log OR estimate in GWAS data
# SNP.iCOGS SNPs name in iCOGS panel. For SNPs with rsid, the name is coded as rsid:position:non-effect-allele:effect-allele; 
# For SNPs without rsid, the name is similarly as var_name 

bcac=read.table(gzfile("/n/home04/cdadams/ld/ldsc/icogs_onco_gwas_meta_overall_breast_cancer_summary_level_statistics.txt.gz"), header=TRUE)
colnames(bcac)
bcac$id=gsub( ":.*$", "", bcac$SNP.iCOGs)
bcac2=bcac[which(startsWith(bcac$id, 'rs')),]   
colnames(bcac2)[5]="Beta"

#save.image("longevity_bcac.Rdata")
write.table(bcac2, file = "/n/home04/cdadams/ld/ldsc/bcac_master3.txt", sep = "\t",
            row.names = FALSE, col.names = TRUE, quote=FALSE)
# Read-in the bcac file formated for ldsc
bcac3=read.table(file = "/n/home04/cdadams/ld/ldsc/bcac_master3.txt", sep = "\t", header=TRUE)

Harmonize the SNPs

setwd('/n/home04/cdadams/Circos-eqtlGen-bc')
exposure_nuc_eqtlGen_p=read.csv('/n/home04/cdadams/Circos-eqtlGen-bc/exposure_nuc_eqtlGen_p_pos_chr.csv')

# Read-in the bcac file formated for ldsc
# bcac3=read.table(file = "/n/home04/cdadams/ld/ldsc/bcac_master3.txt", sep = "\t", header=TRUE)
# bcac3$chr=sub("\\_.*", "", bcac3$var_name)
# bcac3$position=bcac3$Position.Onco
# write.table(bcac3, file = "/n/home04/cdadams/ld/ldsc/bcac_master3.txt", sep = "\t",
#             row.names = FALSE, col.names = TRUE, quote=FALSE)

bc_outcome <- read_outcome_data(
  snps = exposure_nuc_eqtlGen_p$SNP,
  filename = "/n/home04/cdadams/ld/ldsc/bcac_master3.txt",
  sep = '\t',
  snp_col = 'id',
  beta_col = 'Beta',
  se_col = 'SE.Gwas',
  effect_allele_col = 'Effect.Gwas',
  #phenotype_col = 'tissue',#tissue
  units_col = 'chr',
  other_allele_col = 'Baseline.Gwas',
  eaf_col = 'Freq.Gwas',
  samplesize_col = 'Position.Onco',
  #ncase_col = '133384',
  #ncontrol_col = '113789',
  #gene_col = 'gene_name',
  pval_col = 'P.value.Gwas', 
  id_col = 'breast cancer'
)

#Harmonize the alleles btw the two gwas
nuc_eqtlGen_bc <- harmonise_data(exposure_nuc_eqtlGen_p, bc_outcome, action = 2)
nuc_eqtlGen_bc$chr=nuc_eqtlGen_bc$units.exposure
nuc_eqtlGen_bc$position=nuc_eqtlGen_bc$samplesize.exposure
nuc_eqtlGen_bc$samplesize.outcome="247173"
nuc_eqtlGen_bc$samplesize.exposure="30596"
nuc_eqtlGen_bc$id.exposure=nuc_eqtlGen_bc$gene.exposure

write.csv(nuc_eqtlGen_bc, '/n/home04/cdadams/Circos-eqtlGen-bc/nuc_eqtlGen_bc_chr_pos.csv') #can simply read this in for the MR

Circos plot of the RPs in the eqtlGen-BCAC data

RP_circos=read.csv('/n/home04/cdadams/Circos-eqtlGen-bc/nuc_eqtlGen_bc_chr_pos.csv')
colnames(RP_circos)

# Get just the RPs (versus the NUCs)
rp_list_expanded=read.csv('/n/home04/cdadams/ld/ldsc/expanded_verified.csv')

RP_circos_RP=RP_circos[which(RP_circos$gene.exposure%in% rp_list_expanded$all_RP),]

# Select just the variables needed for the circos plot
myvars_pretty_plots=c("SNP","chr", "position", "beta.exposure", "pval.exposure")

pretty_plots=RP_circos_RP[myvars_pretty_plots]
pretty_plots$pval=-log10(pretty_plots$pval)
pretty_plots$pval.exposure <- NULL

write.csv(pretty_plots, "pretty_plots_RP.csv")

# Added a dummy row for chr16 in Excel with the positive version of the largest negative value
# This step is necessary to get the effect estiamte scale 
pretty_plots_dummy=read.csv("pretty_plots_RP_dummy.csv")
pretty_plots_dummy$X=NULL

CMplot(pretty_plots_dummy,type="p",plot.type="c",cir.legend=TRUE,
        outward=FALSE,cir.legend.col="black",cir.chr.h=1.3,chr.den.col="black",file="jpg",
        memo="",dpi=300,file.output=TRUE,verbose=TRUE,width=10,height=10, LOG10=F)

Descriptive statistics

RP_chi_square=read.csv('/n/home04/cdadams/Circos-eqtlGen-bc/nuc_eqtlGen_bc_chr_pos.csv')

# Get just the RPs (versus the NUCs)
rp_list_expanded=read.csv('/n/home04/cdadams/ld/ldsc/expanded_verified.csv')

RP_chi_square_RP=RP_chi_square[which(RP_chi_square$gene.exposure%in% rp_list_expanded$all_RP),]
cyto=rp_list_expanded$all_RP[78:150]
mito=rp_list_expanded$all_RP[1:77]

# Get the range of SNP-gene pairs for the RPs
genes=table(RP_chi_square_RP$gene.exposure)
genes=as.data.frame(genes)
genes=genes[order(genes$Freq),]
head(genes)
##       Var1 Freq
## 107   RPS2    1
## 114 RPS27A    2
## 81  RPL23A    3
## 89  RPL35A    3
## 32  MRPL44    4
## 45  MRPS11    4
#rps23=RP_chi_square_RP[which(RP_chi_square_RP$gene.exposure=="RPS23"),]

# Chi-squared of direction and where: N=51915
RP_chi_square_RP$direction=ifelse(RP_chi_square_RP$beta.exposure<0,"Decreased", "Increased")
table(RP_chi_square_RP$direction)
## 
## Decreased Increased 
##     24456     28459
RP_chi_square_RP$where=ifelse(startsWith(RP_chi_square_RP$gene.exposure, 'MRPL'), 'mitochondria', 'cytosol')
RP_chi_square_RP$where=ifelse(startsWith(RP_chi_square_RP$gene.exposure, 'MRPS'), 'mitochondria', RP_chi_square_RP$where)
RP_chi_square_RP$where=ifelse(startsWith(RP_chi_square_RP$gene.exposure, 'RPL'), 'cytosol', RP_chi_square_RP$where)
RP_chi_square_RP$where=ifelse(startsWith(RP_chi_square_RP$gene.exposure, 'RPS'), 'cytosol', RP_chi_square_RP$where)

RP_where_direction_sum=addmargins(table(RP_chi_square_RP$where, RP_chi_square_RP$direction),2)
RP_where_direction_sum
##               
##                Decreased Increased   Sum
##   cytosol           9522     11883 21405
##   mitochondria     14934     16576 31510
RP_where_direction=table(RP_chi_square_RP$where, RP_chi_square_RP$direction)
RP_where_direction
##               
##                Decreased Increased
##   cytosol           9522     11883
##   mitochondria     14934     16576
RP_where_direction_chisq=chisq.test(table(RP_chi_square_RP$where, RP_chi_square_RP$direction))
RP_where_direction_chisq
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(RP_chi_square_RP$where, RP_chi_square_RP$direction)
## X-squared = 43.293, df = 1, p-value = 4.713e-11
RP_where_direction_chisq[3]#4.713176e-11
## $p.value
## [1] 4.713176e-11
round(RP_where_direction_chisq$stdres,2)
##               
##                Decreased Increased
##   cytosol          -6.59      6.59
##   mitochondria      6.59     -6.59
# ftable(RP_where_direction)
# summary(RP_where_direction)