I have a separate bash script: MR-eqtlGen-long.sh that I’m tracking. I have not yet decided what I prefer: - integrating bash into .Rmd, or - having it separate as its own tracked file - using the ‘eval=FALSE’ option lets me store & render bash commands without running them when the .Rmd runs
# Create directory
mkdir /n/home04/cdadams/MR-eqtlGen-long
cd /n/home04/cdadams/MR-eqtlGen-long
# Listing by file type
ls *.Rmd | head -n 5
ls *.csv | head -n 5
# Copy combo4 for RP/NUC/ribiogenesis gene set
cd /n/home04/cdadams/ld/ldsc/
cp combo4.csv /n/home04/cdadams/MR-eqtlGen-long
# Copy the eqtlGEN GWAS /n/home04/cdadams/MR-eqtlGen-long
cd /n/home04/cdadams/ld/ldsc/
cp 2018-07-18_SNP_AF_for_AlleleB_combined_allele_counts_and_MAF_pos_added.txt /n/home04/cdadams/MR-eqtlGen-long
# Copy MAF files/n/home04/cdadams/MR-eqtlGen-long
cd /n/home04/cdadams/coloc/ch16
cp 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz /n/home04/cdadams/MR-eqtlGen-long
# Copy longevity GWAS
cp lifegen_phase2_bothpl_alldr_2017_09_18.tsv.gz /n/home04/cdadams/MR-eqtlGen-long #original file
cp long_for_mr.txt /n/home04/cdadams/MR-eqtlGen-long #just unzipped
# Got message saying ran out of disk space
#so I looked to see whether the full document saved. It didn't! So, I used FileZilla to delete files I'm not currently using
#to free up cluster storage space. I need to have more conversations with the FAS IT Help about how to get the right configuations
#for storing & writing so many big files.
cd /n/home04/cdadams/MR-eqtlGen-long
ls
wc -l nucs_total_maf_beta_se.txt
\[ \beta = \frac{z} {\sqrt2p(1-p)(n + z^2)} \] \[ SE = \frac{1} {\sqrt2p(1-p)(n + z^2)} \] where \[p = MAF\]
# README files for the eQTLGen GWAS
# This README accompanies the files with cis-eQTL results from eQTLGen
# Files
# -----
# File with full cis-eQTL results: 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz
# File with significant (FDR<0.05) cis-eQTL results: 2019-12-11-cis-eQTLsFDR0.05-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz
# Column Names
# ------------
# Pvalue - P-value
# SNP - SNP rs ID
# SNPChr - SNP chromosome
# SNPPos - SNP position
# AssessedAllele - Assessed allele, the Z-score refers to this allele
# OtherAllele - Not assessed allele
# Zscore - Z-score
# Gene - ENSG name (Ensembl v71) of the eQTL gene
# GeneSymbol - HGNC name of the gene
# GeneChr - Gene chromosome
# GenePos - Centre of gene position
# NrCohorts - Total number of cohorts where this SNP-gene combination was tested
# NrSamples - Total number of samples where this SNP-gene combination was tested
# FDR - False discovery rate estimated based on permutations
# BonferroniP - P-value after Bonferroni correction
#
# Additional information
# ----------------------
# These files contain all cis-eQTL results from eQTLGen, accompanying the article.
# 19,250 genes that showed expression in blood were tested.
# Every SNP-gene combination with a distance <1Mb from the center of the gene and tested in at least 2 cohorts was included.
# Associations where SNP/proxy positioned in Illumina probe were not removed from combined analysis.
# UPDATE LOG
# ----------
# 2018-10-19: Initial data release
# 2018-12-19: In the current README, following file names have been fixed and updated:
# 2019-12-20: Cis-eQTLs are now updated to have a 2-cohort filter: every cis-eQTL must be tested in at least 2 cohorts to be reported
# On cluster, I gzipped the eqtlGen file: gzip 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt
# this is because I kept getting error messages both here & on the cluster saying ther was no room on the disk.
# I moved unnecessary files to a folder in my personal computer DB: /Users/charleenadams/Dropbox/Harvard/longevity/survival/odyssey_transfer/
# This has the gz files for the protein GWAS, the eQTLGen SMR file, and the eQTLGen EAF file.
# I launched an interacted terminal session with increased memory:
# salloc -p test --mem 5000 -t 0-06:00
#-----------------------------------------------------------------------------------------------------------------------
# README file for the MAF eQTLGen cis-eQTL data
# A README accompanies the file with allele frequencies from eQTLGen cis-eQTL analysis:
# 2018-07-18_SNP_AF_for_AlleleB_combined_allele_counts_and_MAF_pos_added.txt.gz
# Column Names
# ------------
# SNP - SNP rs ID
# hg19_chr - chr number
# hg19_pos - SNP position (hg19)
# AlleleA - Other allele
# AlleleB - Assessed allele
# allA_total - Total allele count of genotype AA
# allAB_total - Total allele count of genotype AB
# allB_total - Total allele count of genotype BB
# AlleleB_all - Allele frequency of assessed allele
# Additional information
# ----------------------
# The allele frequencies were calculated using reported allele counts from all cohorts except
# Framingham Heart Study, because the related samples are present in this cohort.
#-----------------------------------------------------------------------------------------------------------------------
setwd('/n/home04/cdadams/MR-eqtlGen-long')
combo4=read.csv('/n/home04/cdadams/MR-eqtlGen-long/combo4.csv')
tbl=fread('cat 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz | gunzip')
# Extract NUCs
nucs=tbl[which(tbl$GeneSymbol %in% combo4$gene),]
head(tbl$Gene)
# Read-in MAFs
eqtl_MAF=read.table(file = "2018-07-18_SNP_AF_for_AlleleB_combined_allele_counts_and_MAF_pos_added.txt",
sep = "\t", header=TRUE)
nucs_total <- merge(nucs,eqtl_MAF,by="SNP")
# Calculate betas and SEs
n=30596
nucs_total$Beta=nucs_total$Zscore/sqrt(2*(nucs_total$AlleleB_all)*
(1-nucs_total$AlleleB_all)*(n+nucs_total$Zscore^2))
nucs_total$SE=1/sqrt(2*(nucs_total$AlleleB_all)*
(1-nucs_total$AlleleB_all)*(n+nucs_total$Zscore^2))
write.table(nucs_total, file = "/n/home04/cdadams/MR-eqtlGen-long/nucs_total_maf_beta_se.txt", sep = "\t",
row.names = FALSE, col.names = TRUE, quote=FALSE)
# gznucs <- gzfile("/n/home04/cdadams/MR-eqtlGen-long/nucs.gz", "w")
# write.csv(nucs, gznucs)
# close(gznucs)
# Read-in for MR formatting
exposure_nuc_eqtlGen <- read_exposure_data(
filename = "/n/home04/cdadams/MR-eqtlGen-long/nucs_total_maf_beta_se.txt",
sep = '\t',
snp_col = 'SNP',
beta_col = 'Beta',
se_col = 'SE',
effect_allele_col = 'AssessedAllele',
#phenotype_col = 'tissue',
#units_col = 'tissue',
other_allele_col = 'OtherAllele',
eaf_col = 'AlleleB_all',
#samplesize_col = 'samplesize',
#ncase_col = 'ncase',
#ncontrol_col = 'ncontrol',
gene_col = 'GeneSymbol',
pval_col = 'Pvalue',
id_col = 'NUC eqtlGEN'
)
exposure_nuc_eqtlGen_p=exposure_nuc_eqtlGen[which(exposure_nuc_eqtlGen$pval.exposure<0.000005),]
write.csv(exposure_nuc_eqtlGen_p, 'exposure_nuc_eqtlGen_p.csv')
exposure_nuc_eqtlGen_p=read.csv('exposure_nuc_eqtlGen_p.csv')
long_outcome <- read_outcome_data(
snps = exposure_nuc_eqtlGen_p$SNP,
filename = "/n/home04/cdadams/MR-eqtlGen-long/long_for_mr.txt",
sep = '\t',
snp_col = 'rsid',
beta_col = 'beta1',
se_col = 'se',
effect_allele_col = 'a1',
#phenotype_col = 'tissue',#tissue
#units_col = 'tissue',
other_allele_col = 'a0',
eaf_col = 'freq1',
#samplesize_col = 'samplesize',
#ncase_col = 'ncase',
#ncontrol_col = 'ncontrol',
gene_col = 'gene_name',
pval_col = 'p',
id_col = 'longevity'
)
#Harmonize the alleles btw the two gwas
nuc_eqtlGen_long <- harmonise_data(exposure_nuc_eqtlGen_p, long_outcome , action = 2)
nuc_eqtlGen_long$samplesize.outcome="1012240"
nuc_eqtlGen_long$samplesize.exposure="30596"
nuc_eqtlGen_long$id.exposure=nuc_eqtlGen_long$gene.exposure
head(nuc_eqtlGen_long)
table(nuc_eqtlGen_long$gene.exposure)
write.csv(nuc_eqtlGen_long, 'nuc_eqtlGen_long.csv') #can simply read this in for the MR
nuc_eqtlGen_long=read.csv('nuc_eqtlGen_long.csv')
# Split the harmonized data into separate data frames for each genes
nuc_split <- split(nuc_eqtlGen_long, nuc_eqtlGen_long$id.exposure)
list2env(nuc_split, envir= .GlobalEnv) #split the list into separate datasets
# Remove the original nuc_eqtlGen_long and nuc_split
rm(list= ls()[(ls() %in% c("nuc_eqtlGen_long","nuc_split") )])
# Save each harmonized gene as a .csv
for (i in 1:length(files)){
write.csv(files[[i]], paste(names(files[i]), ".csv", sep = ""))
}
# Get them back together: not used here but useful
# dfs = sapply(.GlobalEnv, is.data.frame)
# dfs
#
# df=do.call(rbind, mget(names(dfs)[dfs]))
# head(df)
# dim(df)
# Combine the separated files into a list
files <- mget(ls())
length(files)
# Use a loop to clump each NUC dataframe in the list, run MR, & save results
for (i in 1:length(files)){
files[[i]]=clump_data(files[[i]])
write.csv(files[[i]], paste(names(files[i]), "_clump3.csv", sep = ""))
files[[i]]=mr(files[[i]])
write.csv(files[[i]], paste(names(files[i]), "_mr.csv", sep = ""))
}
# Idea for reading all mr results in and saving:
# Idea from: https://statisticsglobe.com/merge-csv-files-in-r
# Sadly, didn't work.
# data_all <- list.files(path = "/n/home04/cdadams/MR-eqtlGen-long",
# pattern = "*_mr.csv", full.names = TRUE) %>%
# lapply(read_csv) %>%
# bind_rows
# write.csv(data_all, "complete_NUC_MR_res.csv")
# Command to remove everything except "v"
#rm(list=(ls()[ls()!="v"]))
# Merge all the MR results and save
tbl_fread <-
list.files(pattern = "*_mr.csv") %>%
map_df(~fread(.))
tbl_fread_df=as.data.frame(tbl_fread)
myvars=c('id.exposure','id.outcome','method', 'nsnp','b','se','pval')
res=tbl_fread_df[myvars]
res$id.outcome="longevity"
res=res[order(res$p),]
# FDR correction
p=res$p
fdr=p.adjust(p, method = "fdr", n = length(p))
res$fdr=fdr
res=res[order(res$fdr),]
write.csv(res,'MR_eqtlGEN_NUC_long.csv')
# Select just the set with >2 SNPs
res=res[order(res$fdr),]
res_meta_set=res[which(res$nsnp>2),]
res_meta_set=res_meta_set[which(res_meta_set$method=='Inverse variance weighted'),]
# FDR for res_meta_set (only 110 IVWs)
p2=res_meta_set$p
fdr2=p.adjust(p2, method = "fdr", n = length(p2))
res_meta_set$fdr2=fdr2
res_meta_set$fdr=NULL
res_meta_set=res_meta_set[order(res_meta_set$fdr2),]
write.csv(res_meta_set,'MR_eqtlGEN_NUC_long_meta_set.csv')