With: relaxed pval (5E-6) and clump = 500
rm(list = ls())
# Load Required Libraries
# Mendelian Randomization
library(TwoSampleMR) # Core package for Mendelian Randomization (MR) analyses
library(MendelianRandomization) # Additional MR methods, including MR-Lasso
library(ieugwasr) # For local LD clumping with ld_clump()
library(MRInstruments) # For proxy SNP lookup (if needed)
# Data Wrangling & Handling
library(dplyr) # Data manipulation (filtering, joining, summarizing)
library(tidyr) # Data reshaping
library(data.table) # Fast and efficient data handling
library(readxl) # Reading Excel files
library(openxlsx) # Writing and formatting Excel files efficiently
# Statistical & Visualization
library(ggplot2) # For plotting
library(ggrepel) # Improved text labeling in plots
library(corrplot) # Visualization of correlation matrices
library(RhpcBLASctl) # Control multithreading for efficiency in MR analyses
library(biomaRt) # Querying Ensembl for gene annotation
library(scales) # For dynamic color generation
# Set Working Directory & Setup Folder for Project
setwd("/Users/charleenadams/mr_nag_bmi/")
if (!dir.exists("results_finngen_p5E6")) dir.create("results_finngen_p5E6", recursive = TRUE)
# ---------------------------------------------
# Load and Format NAG Data
# ---------------------------------------------
nag <- fread("/Users/charleenadams/mr_nag_bmi/phenocode-C100001253.tsv") %>% as.data.frame()
filtered_nag_df <- nag %>%
filter(!is.na(rsids) & rsids != "" & grepl("^rs", rsids)) %>%
arrange(rsids)
exp_dat <- filtered_nag_df %>%
mutate(
chr.exposure = chrom,
pos.exposure = pos,
beta.exposure = beta,
se.exposure = sebeta,
exposure = "N-acetylglutamine",
id.exposure = "N-acetylglutamine",
pval.exposure = pval,
SNP.exposure = rsids,
SNP = rsids,
effect_allele.exposure = alt,
other_allele.exposure = ref,
eaf.exposure = maf,
samplesize.exposure = 5830,
id_col = nearest_genes
)
# ---------------------------------------------
# Load and Format BMI Data
# ---------------------------------------------
bmi <- fread("/Users/charleenadams/temp_BI/mr_nat_pter_bmi/summary_stats_release_finngen_R12_BMI_IRN")
filtered_bmi_df <- bmi %>%
filter(!is.na(rsids) & rsids != "" & grepl("^rs", rsids)) %>%
arrange(rsids) %>%
rename(CHR = `#chrom`)
out_dat <- filtered_bmi_df %>%
mutate(
SNP = rsids,
SNP.outcome = rsids,
chr.outcome = CHR,
pos.outcome = pos,
beta.outcome = beta,
se.outcome = sebeta,
pval.outcome = pval,
effect_allele.outcome = alt,
other_allele.outcome = ref,
eaf.outcome = af_alt,
samplesize.outcome = 500348,
id.outcome = "FinnGen BMI",
outcome = "FinnGen BMI",
id_col = "nearest_genes"
)
# ---------------------------------------------
# Harmonize Data
# ---------------------------------------------
dat <- harmonise_data(exposure_dat = exp_dat, outcome_dat = out_dat)
today <- Sys.Date()
write.csv(dat, paste0("/Users/charleenadams/mr_nag_bmi/harmonized_nag_finngen_BMI_dat_", today, ".csv"), row.names = FALSE)
# ---------------------------------------------
# Select 1MB around ACY1 region
# ---------------------------------------------
dat <- fread("/Users/charleenadams/mr_nag_bmi/harmonized_nag_finngen_BMI_dat_2025-04-02.csv")
# "/Users/charleenadams/mr_ukbppp_chd/mart_export_clean.txt" #build 38
# Load the necessary data (local_annotation)
local_annotation <- read.table("/Users/charleenadams/mr_ukbppp_chd/mart_export_clean.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
# Filter for the gene ACY1
acy1_data <- local_annotation[local_annotation$Gene.name == "ACY1", ]
# Check the data for ACY1
head(acy1_data)
# Extract the TSS (Transcription Start Site)
# TSS is given as the start site of the transcript, use the `Transcript.start..bp.` column
acy1_tss <- min(acy1_data$Transcript.start..bp.)
# Define the 500kb window up and down from the TSS
upstream_window <- acy1_tss - 500000
downstream_window <- acy1_tss + 500000
# Display the results
cat("TSS for ACY1:", acy1_tss, "\n")
cat("500kb upstream:", upstream_window, "\n")
cat("500kb downstream:", downstream_window, "\n")
ACY1_chrom <- acy1_data$Chromosome.scaffold.name[1]
# Define the filtered dataset
dat_filtered <- dat %>%
filter(chr.outcome == ACY1_chrom & pos.outcome >= upstream_window & pos.outcome <= downstream_window)
# Create the filename with the current system date
file_name <- paste0("/Users/charleenadams/mr_nag_bmi/filtered_ACY1_1Mb_finngen_BMI_", Sys.Date(), ".csv")
# Save the filtered data to a CSV file with the system date in the filename
write.csv(dat_filtered, file = file_name, row.names = FALSE)
# ---------------------------------------------
# MR
# ---------------------------------------------
# Step 1: Filter Instruments
dat_filtered <- read.csv("/Users/charleenadams/mr_nag_bmi/filtered_ACY1_1Mb_finngen_BMI_2025-04-02.csv")
instruments <- dat_filtered %>%
filter(mr_keep == TRUE, pval.exposure < 5e-6) %>% # Relaxed threshold
mutate(
rsid = SNP,
pval = pval.exposure,
id = "N-acetylguanine (1MB around ACY1 TSS)",
F_stat = (beta.exposure / se.exposure)^2 # Compute F-statistic
) %>%
filter(F_stat > 10) # Exclude weak instruments
cat("Number of instruments after F-stat filtering:", nrow(instruments), "\n")
# Step 1.5: Steiger Filtering
instruments <- steiger_filtering(instruments)
instruments <- instruments %>% filter(steiger_dir == TRUE) # Keep only SNPs with correct direction
cat("Number of instruments after Steiger filtering:", nrow(instruments), "\n")
cat("Preview of Steiger-filtered instruments:\n")
print(head(instruments))
# Subset instruments to keep only relevant fields
instruments_subset <- instruments %>%
dplyr::select(
SNP,
chr.exposure,chr.outcome,pos.exposure,pos.outcome,
effect_allele.exposure, other_allele.exposure,
effect_allele.outcome, other_allele.outcome,
beta.exposure, se.exposure, pval.exposure,
beta.outcome, se.outcome, pval.outcome,
eaf.exposure, eaf.outcome,
id.exposure, exposure,
id.outcome, outcome,
samplesize.exposure, samplesize.outcome,
mr_keep, action,
F_stat,
steiger_dir, steiger_pval
)
instruments_subset$rsid <- instruments_subset$SNP
# Step 2: Local LD Clumping
clumped <- ld_clump(
dplyr::tibble(rsid = instruments_subset$rsid, pval = instruments_subset$pval, id = instruments_subset$id),
plink_bin = genetics.binaRies::get_plink_binary(),
bfile = "/Users/charleenadams/1000G_bfiles/EUR/EUR",
clump_r2 = 0.001,
clump_kb = 500 # Reduced window
)
clumped_dat <- instruments_subset %>% dplyr::filter(SNP %in% clumped$rsid)
cat("Local clumping completed. Number of SNPs retained:", nrow(clumped_dat), "\n")
cat("Preview of clumped data:\n")
print(head(clumped_dat))
# Subset clumped_dat to keep only relevant fields
clumped_dat_subset <- clumped_dat %>%
dplyr::select(
SNP,
effect_allele.exposure, other_allele.exposure,
effect_allele.outcome, other_allele.outcome,
beta.exposure, se.exposure, pval.exposure,
beta.outcome, se.outcome, pval.outcome,
eaf.exposure, eaf.outcome,
id.exposure, exposure,
id.outcome, outcome,
samplesize.exposure, samplesize.outcome,
mr_keep, action,
F_stat,
steiger_dir, steiger_pval
)
# Step 3: Perform Wald Ratio Tests for Each Instrument
wald_ratios <- clumped_dat %>%
mutate(
wald_beta = beta.outcome / beta.exposure,
wald_se = sqrt((se.outcome^2 / beta.exposure^2) + ((beta.outcome^2 * se.exposure^2) / (beta.exposure^4))),
pval = 2 * pnorm(abs(wald_beta / wald_se), lower.tail = FALSE),
method = paste("Wald Ratio:", SNP)
) %>%
dplyr::select(SNP, wald_beta, wald_se, pval, method)
cat("\n=== Wald Ratio Tests for Each Instrument ===\n")
print(wald_ratios)
# Step 4: Bind Results
mr_results <- bind_rows(
wald_ratios %>% dplyr::select(method, b = wald_beta, se = wald_se, pval)
) %>%
mutate(method = as.factor(method)) %>%
filter(!is.na(method) & method != "NA") %>%
arrange(b) %>%
mutate(method = factor(method, levels = unique(method)))
# Step 5: Create Forest Plot
base_colors <- c("wald_ratios" = "#33A02C")
wald_methods <- unique(wald_ratios$method)
n_wald <- length(wald_methods)
if (n_wald > 0) {
wald_colors <- hue_pal()(n_wald)
names(wald_colors) <- wald_methods
} else {
wald_colors <- NULL
}
color_list <- c(base_colors, wald_colors)
available_methods <- unique(mr_results$method)
color_list <- color_list[names(color_list) %in% available_methods]
forest_plot <- ggplot(wald_ratios, aes(x = wald_beta, y = method, color = method)) +
geom_point(size = 3) +
geom_errorbarh(aes(xmin = wald_beta - 1.96 * wald_se, xmax = wald_beta + 1.96 * wald_se), height = 0.2) +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
labs(
title = "Mendelian Randomization Estimates:\n N-acetylguanine (1MB around ACY1 TSS) on Jurgens BMI",
x = "Causal Effect (Beta)",
y = ""
) +
theme_minimal() +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
axis.text.y = element_text(size = 12, face = "bold"),
axis.text.x = element_text(size = 10),
axis.title.x = element_text(size = 12),
panel.grid.major = element_line(color = "grey90"),
panel.grid.minor = element_blank(),
legend.position = "none",
panel.background = element_rect(fill = "white", color = NA),
plot.background = element_rect(fill = "white", color = NA)
) +
scale_color_manual(values = color_list) +
xlim(-0.15, 0.05)
# Step 6: Save Results and Plot
today <- Sys.Date()
results_dir <- "/Users/charleenadams/mr_nag_bmi/results_finngen_p5E6/"
excel_file <- paste0(results_dir, "MR_Results_ACY1_finngen_BMI_", today, ".xlsx")
plot_file <- paste0(results_dir, "MR_Forestplot_ACY1_finngen_BMI_", today, ".png")
# Save Wald ratios to CSV
write.csv(wald_ratios, file = paste0(results_dir, "wald_ratios_", today, ".csv"), row.names = FALSE)
# Save forest plot as PNG
ggsave(plot_file, plot = forest_plot, width = 10, height = 8, dpi = 300)
cat("Results and plot saved successfully!")