With: relaxed pval (5E-6) and clump = 500
rm(list = ls())
# Load Required Libraries
# Mendelian Randomization
library(TwoSampleMR) # Core package for Mendelian Randomization (MR) analyses
library(MendelianRandomization) # Additional MR methods, including MR-Lasso
library(ieugwasr) # For local LD clumping with ld_clump()
library(MRInstruments) # For proxy SNP lookup (if needed)
# Data Wrangling & Handling
library(dplyr) # Data manipulation (filtering, joining, summarizing)
library(tidyr) # Data reshaping
library(data.table) # Fast and efficient data handling
library(readxl) # Reading Excel files
library(openxlsx) # Writing and formatting Excel files efficiently
# Statistical & Visualization
library(ggplot2) # For plotting
library(ggrepel) # Improved text labeling in plots
library(corrplot) # Visualization of correlation matrices
library(RhpcBLASctl) # Control multithreading for efficiency in MR analyses
library(biomaRt) # Querying Ensembl for gene annotation
library(scales) # For dynamic color generation
# Set Working Directory & Setup Folder for Project
setwd("/Users/charleenadams/temp_BI/mr_nat_pter_bmi")
if (!dir.exists("finngen_rep_P06")) dir.create("finngen_rep_P06", recursive = TRUE)
# ---------------------------------------------
# Load and Format NAT Data
# ---------------------------------------------
nat <- fread("/Users/charleenadams/temp_BI/mr_nat_pter_bmi/C100005466.uncompressed") %>% as.data.frame()
filtered_nat_df <- nat %>%
filter(!is.na(rsids) & rsids != "" & grepl("^rs", rsids)) %>%
arrange(rsids)
exp_dat <- filtered_nat_df %>%
mutate(
chr.exposure = chrom,
pos.exposure = pos,
beta.exposure = beta,
se.exposure = sebeta,
exposure = "N-acetyltaurine",
id.exposure = "N-acetyltaurine",
pval.exposure = pval,
SNP.exposure = rsids,
SNP = rsids,
effect_allele.exposure = alt,
other_allele.exposure = ref,
eaf.exposure = maf,
samplesize.exposure = 6099,
id_col = nearest_genes
)
# ---------------------------------------------
# Load and Format BMI Data (FinnGen)
# ---------------------------------------------
bmi <- fread("/Users/charleenadams/temp_BI/mr_nat_pter_bmi/summary_stats_release_finngen_R12_BMI_IRN")
filtered_bmi_df <- bmi %>%
filter(!is.na(rsids) & rsids != "" & grepl("^rs", rsids)) %>%
arrange(rsids) %>%
rename(CHR = `#chrom`)
out_dat <- filtered_bmi_df %>%
mutate(
SNP = rsids,
SNP.outcome = rsids,
chr.outcome = CHR,
pos.outcome = pos,
beta.outcome = beta,
se.outcome = sebeta,
pval.outcome = pval,
effect_allele.outcome = alt,
other_allele.outcome = ref,
eaf.outcome = af_alt,
samplesize.outcome = 500348,
id.outcome = "FinnGen BMI",
outcome = "FinnGen BMI",
id_col = "nearest_genes"
)
# ---------------------------------------------
# Harmonize Data
# ---------------------------------------------
dat <- harmonise_data(exposure_dat = exp_dat, outcome_dat = out_dat)
today <- Sys.Date()
write.csv(dat, paste0("/Users/charleenadams/temp_BI/mr_nat_pter_bmi/harmonized_nat_finngen_BMI_dat_", today, ".csv"), row.names = FALSE)
# ---------------------------------------------
# Select 1MB around PTER region
# ---------------------------------------------
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
pter_data <- getBM(attributes = c("ensembl_gene_id", "external_gene_name", "chromosome_name",
"transcription_start_site", "strand"),
filters = "external_gene_name", values = "PTER", mart = ensembl)
pter_tss <- min(pter_data$transcription_start_site)
pter_chrom <- pter_data$chromosome_name[1]
window_size <- 500000
tss_start <- pter_tss - window_size
tss_end <- pter_tss + window_size
dat_filtered <- dat %>%
filter(chr.outcome == pter_chrom & pos.outcome >= tss_start & pos.outcome <= tss_end)
write.csv(dat_filtered, file = "/Users/charleenadams/temp_BI/mr_nat_pter_bmi/filtered_PTER_1Mb_finngen_BMI_2025-02-19.csv", row.names = FALSE)
cat("Filtered data saved to: /Users/charleenadams/temp_BI/mr_nat_pter_bmi/filtered_PTER_1Mb_finngen_BMI_2025-02-19.csv\n")
cat("Number of observations within 1 Mb of PTER TSS:", nrow(dat_filtered), "\n")
# ---------------------------------------------
# MR
# ---------------------------------------------
# Step 1: Filter Instruments
dat_filtered <- read.csv("/Users/charleenadams/temp_BI/mr_nat_pter_bmi/filtered_PTER_1Mb_finngen_BMI_2025-02-19.csv")
instruments <- dat_filtered %>%
filter(mr_keep == TRUE, pval.exposure < 5e-6) %>% # Relaxed threshold
mutate(
rsid = SNP,
pval = pval.exposure,
id = "N-acetyltaurine (1MB around PTER TSS)",
F_stat = (beta.exposure / se.exposure)^2 # Compute F-statistic
) %>%
filter(F_stat > 10) # Exclude weak instruments
cat("Number of instruments after F-stat filtering:", nrow(instruments), "\n")
# Step 1.5: Steiger Filtering
instruments <- steiger_filtering(instruments)
instruments <- instruments %>% filter(steiger_dir == TRUE) # Keep only SNPs with correct direction
cat("Number of instruments after Steiger filtering:", nrow(instruments), "\n")
cat("Preview of Steiger-filtered instruments:\n")
print(head(instruments))
# Subset instruments to keep only TwoSampleMR, F-stat, and Steiger fields
instruments_subset <- instruments %>%
dplyr::select(
SNP,
effect_allele.exposure, other_allele.exposure,
effect_allele.outcome, other_allele.outcome,
beta.exposure, se.exposure, pval.exposure,
beta.outcome, se.outcome, pval.outcome,
eaf.exposure, eaf.outcome,
id.exposure, exposure,
id.outcome, outcome,
samplesize.exposure, samplesize.outcome,
mr_keep, action,
F_stat,
steiger_dir, steiger_pval
)
# Use the same instru
snps=c("rs117372132",
"rs142238737",
"rs3802555",
"rs4747286",
"rs7075357")
test <- instruments_subset[which(instruments_subset$SNP%in%snps),]
clumped_dat <- test
# We wil skip clumping since we are using the same set of independent instruments as was chosen for the Jurgens
# # Step 2: Local LD Clumping
# clumped <- ld_clump(
# dplyr::tibble(rsid = instruments$rsid, pval = instruments$pval, id = instruments$id),
# plink_bin = genetics.binaRies::get_plink_binary(),
# bfile = "/Users/charleenadams/1000G_bfiles/EUR/EUR",
# clump_r2 = 0.001,
# clump_kb = 500 # Reduced window
# )
# clumped_dat <- instruments %>% dplyr::filter(SNP %in% clumped$rsid)
# cat("Local clumping completed. Number of SNPs retained:", nrow(clumped_dat), "\n")
# cat("Preview of clumped data:\n")
# print(head(clumped_dat))
#
# # Subset clumped_dat to keep only TwoSampleMR, F-stat, and Steiger fields
# clumped_dat_subset <- clumped_dat %>%
# dplyr::select(
# SNP,
# effect_allele.exposure, other_allele.exposure,
# effect_allele.outcome, other_allele.outcome,
# beta.exposure, se.exposure, pval.exposure,
# beta.outcome, se.outcome, pval.outcome,
# eaf.exposure, eaf.outcome,
# id.exposure, exposure,
# id.outcome, outcome,
# samplesize.exposure, samplesize.outcome,
# mr_keep, action,
# F_stat,
# steiger_dir, steiger_pval
# )
# Step 3.0: Perform Many MR Analyses at Once
mr_result <- mr_wrapper(clumped_dat)
# Extract the nested list components from mr_wrapper
estimates <- mr_result$`N-acetyltaurine.FinnGen BMI`$estimates
heterogeneity <- mr_result$`N-acetyltaurine.FinnGen BMI`$heterogeneity
directional_pleiotropy <- mr_result$`N-acetyltaurine.FinnGen BMI`$directional_pleiotropy
info <- mr_result$`N-acetyltaurine.FinnGen BMI`$info
snps_retained <- mr_result$`N-acetyltaurine.FinnGen BMI`$snps_retained
# Print the structure to confirm the data is there
print(estimates)
print(heterogeneity)
print(directional_pleiotropy)
print(info)
print(snps_retained)
# Step 3: Perform Selected MR Analyses
# 3.1: Inverse Variance Weighted (IVW)
ivw_result <- mr(clumped_dat, method_list = "mr_ivw")
# 3.2: MR-Egger
egger_result <- mr(clumped_dat, method_list = "mr_egger_regression")
# 3.3: Weighted Median
weighted_median_result <- mr(clumped_dat, method_list = "mr_weighted_median")
# 3.4: MR-Lasso
mr_input <- mr_input(
bx = clumped_dat$beta.exposure,
bxse = clumped_dat$se.exposure,
by = clumped_dat$beta.outcome,
byse = clumped_dat$se.outcome,
exposure = "N-acetyltaurine",
outcome = "BMI",
snps = clumped_dat$SNP
)
lasso_result <- tryCatch({
mr_lasso(mr_input)
}, error = function(e) {
cat("Error in MR-Lasso:", conditionMessage(e), "\n")
NULL
})
# 3.5: Contamination Mixture (ConMix)
conmix_result <- tryCatch({
mr_conmix(mr_input)
}, error = function(e) {
cat("Error in MR-ConMix:", conditionMessage(e), "\n")
NULL
})
# 3.6: Heterogeneity Test
heterogeneity_result <- mr_heterogeneity(clumped_dat)
# 3.7: Pleiotropy Test
pleiotropy_result <- mr_pleiotropy_test(clumped_dat)
# 3.8: Leave-One-Out Analysis
loo_result <- mr_leaveoneout(clumped_dat)
# Step 4: Perform Wald Ratio Tests for Each Instrument
wald_ratios <- clumped_dat %>%
mutate(
wald_beta = beta.outcome / beta.exposure,
wald_se = sqrt((se.outcome^2 / beta.exposure^2) + ((beta.outcome^2 * se.exposure^2) / (beta.exposure^4))),
pval = 2 * pnorm(abs(wald_beta / wald_se), lower.tail = FALSE),
method = paste("Wald Ratio:", SNP)
) %>%
dplyr::select(SNP, wald_beta, wald_se, pval, method)
cat("\n=== Wald Ratio Tests for Each Instrument ===\n")
print(wald_ratios)
# Step 5: Prepare Data for Forest Plot
ivw_df <- ivw_result %>% mutate(method = "IVW")
egger_df <- egger_result %>% mutate(method = "MR-Egger")
weighted_median_df <- weighted_median_result %>% mutate(method = "Weighted Median")
lasso_df <- if (!is.null(lasso_result)) {
data.frame(method = "MR-Lasso", b = lasso_result@Estimate,
se = lasso_result@StdError, pval = lasso_result@Pvalue)
} else {
NULL
}
conmix_se <- if (!is.null(conmix_result)) {
(conmix_result@CIUpper - conmix_result@CILower) / (2 * 1.96) # SE = (CIUpper - CILower) / (2 * 1.96) for 95% CI
} else {
NA
}
conmix_df <- if (!is.null(conmix_result)) {
data.frame(method = "MR-ConMix", b = conmix_result@Estimate,
se = conmix_se, pval = conmix_result@Pvalue)
} else {
NULL
}
mr_results <- bind_rows(
ivw_df,
egger_df,
weighted_median_df,
lasso_df,
conmix_df,
wald_ratios %>% dplyr::select(method, b = wald_beta, se = wald_se, pval)
) %>%
mutate(method = as.factor(method)) %>%
filter(!is.na(method) & method != "NA") %>%
arrange(b) %>%
mutate(method = factor(method, levels = unique(method)))
# Step 6: Create Beautiful Forest Plot with Dynamic Colors (No Numbers)
base_colors <- c(
"IVW" = "#1F78B4",
"MR-Egger" = "#FF7F00",
"Weighted Median" = "#33A02C",
"MR-Lasso" = "#FB9A99",
"MR-ConMix" = "#E41A1C"
)
wald_methods <- unique(wald_ratios$method)
n_wald <- length(wald_methods)
if (n_wald > 0) {
wald_colors <- hue_pal()(n_wald)
names(wald_colors) <- wald_methods
} else {
wald_colors <- NULL
}
color_list <- c(base_colors, wald_colors)
available_methods <- unique(mr_results$method)
color_list <- color_list[names(color_list) %in% available_methods]
# Dynamically determine x-axis limits based on confidence intervals
x_min <- min(mr_results$b - 1.96 * mr_results$se, na.rm = TRUE)
x_max <- max(mr_results$b + 1.96 * mr_results$se, na.rm = TRUE)
forest_plot <- ggplot(mr_results, aes(x = b, y = method, color = method)) +
# Plot error bars FIRST to prevent them from being hidden
geom_errorbarh(aes(xmin = b - 1.96 * se, xmax = b + 1.96 * se),
height = 0.2, na.rm = TRUE, linewidth = 0.8) +
# Plot points over error bars
geom_point(size = 3) +
# Add reference line at zero
geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
# Labels and title
labs(
title = "Mendelian Randomization Estimates:\n N-acetyltaurine (1MB around PTER TSS)\n on FinnGen BMI",
x = "Causal Effect (Beta)",
y = ""
) +
# Styling
theme_minimal() +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
axis.text.y = element_text(size = 12, face = "bold"),
axis.text.x = element_text(size = 10),
axis.title.x = element_text(size = 12),
panel.grid.major = element_line(color = "grey90"),
panel.grid.minor = element_blank(),
legend.position = "none",
panel.background = element_rect(fill = "white", color = NA),
plot.background = element_rect(fill = "white", color = NA)
) +
scale_color_manual(values = color_list) +
# Adjust x-limits dynamically based on data
xlim(x_min - 0.02, x_max + 0.02)
# Print plot to check output
print(forest_plot)
# Step 7.0: Save Results and Plot
today <- Sys.Date()
results_dir <- "/Users/charleenadams/temp_BI/mr_nat_pter_bmi/finngen_rep_P06/"
excel_file <- paste0(results_dir, "MR_Results_PTER_FinnGen_BMI_", today, ".xlsx")
plot_file <- paste0(results_dir, "MR_Forestplot_PTER_FinnGen_BMI_", today, ".png")
if (!dir.exists(results_dir)) {
dir.create(results_dir, recursive = TRUE)
cat("Created directory:", results_dir, "\n")
}
# Step 7.0.1: Save Same Plot but Ordered by Methods
# Generate the reordered forest plot by method
# Dynamically determine x-axis limits based on confidence intervals
x_min <- min(mr_results$b - 1.96 * mr_results$se, na.rm = TRUE)
x_max <- max(mr_results$b + 1.96 * mr_results$se, na.rm = TRUE)
# Explicitly set the correct order by method name
desired_order <- c(
"IVW",
"MR-Egger",
"MR-Lasso",
"MR-ConMix",
"Weighted Median",
"Wald Ratio: rs117372132",
"Wald Ratio: rs142238737",
"Wald Ratio: rs3802555",
"Wald Ratio: rs4747286",
"Wald Ratio: rs7075357"
)
# Force the order in the dataset itself
mr_results$method <- factor(mr_results$method, levels = desired_order)
# Sort the dataset based on the new factor levels BEFORE plotting
mr_results <- mr_results[order(mr_results$method), ]
forest_plot_by_method <- ggplot(mr_results, aes(x = b, y = method, color = method)) +
# Plot error bars FIRST to prevent them from being hidden
geom_errorbarh(aes(xmin = b - 1.96 * se, xmax = b + 1.96 * se),
height = 0.2, na.rm = TRUE, linewidth = 0.8) +
# Plot points over error bars
geom_point(size = 3) +
# Add reference line at zero
geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
# Labels and title
labs(
title = "Mendelian Randomization Estimates:\n N-acetyltaurine (1MB around PTER TSS)\non FinnGen BMI",
x = "Causal Effect (Beta)",
y = ""
) +
# Styling
theme_minimal() +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
axis.text.y = element_text(size = 12, face = "bold"),
axis.text.x = element_text(size = 10),
axis.title.x = element_text(size = 12),
panel.grid.major = element_line(color = "grey90"),
panel.grid.minor = element_blank(),
legend.position = "none",
panel.background = element_rect(fill = "white", color = NA),
plot.background = element_rect(fill = "white", color = NA)
) +
scale_color_manual(values = color_list) +
# Adjust x-limits dynamically based on data
xlim(x_min - 0.02, x_max + 0.02)
# Print plot to check output
print(forest_plot_by_method)
# Save the reordered plot
plot_file_by_method <- paste0(results_dir, "MR_Forestplot_PTER_FinnGen_BMI_By_Method_", today, ".png")
ggsave(plot_file_by_method, plot = forest_plot_by_method, dpi = 300, width = 8, height = 5)
# 7.1: Save Everything in One Excel Spreadsheet
wb <- createWorkbook()
title_style <- createStyle(fontSize = 14, fontColour = "black", textDecoration = "bold", halign = "center")
header_style <- createStyle(fontColour = "white", fgFill = "#C8A2C8", textDecoration = "bold", halign = "center", border = "TopBottomLeftRight")
add_formatted_sheet <- function(wb, sheet_name, data, title) {
addWorksheet(wb, sheet_name)
writeData(wb, sheet_name, title, startRow = 1, startCol = 1)
writeData(wb, sheet_name, data, startRow = 3, startCol = 1, headerStyle = header_style)
mergeCells(wb, sheet_name, cols = 1:ncol(data), rows = 1)
setRowHeights(wb, sheet_name, rows = 1, heights = 20)
setColWidths(wb, sheet_name, cols = 1:ncol(data), widths = "auto")
addStyle(wb, sheet_name, title_style, rows = 1, cols = 1)
}
# TOC with all sections
toc_data <- data.frame(
Sheet = c("Estimates", "Heterogeneity", "Directional_Pleiotropy", "Info", "SNPs_Retained",
"IVW", "MR-Egger", "Weighted_Median", "MR-Lasso", "MR-ConMix",
"Wald_Ratios", "Heterogeneity_Test", "Pleiotropy_Test", "Leave_One_Out",
"Instruments", "Clumped_Data"),
Title = c("MR Causal Estimates (mr_wrapper)", "Heterogeneity Results (mr_wrapper)",
"Directional Pleiotropy Results (mr_wrapper)", "Summary Info (mr_wrapper)",
"SNPs Retained (mr_wrapper)",
"Inverse Variance Weighted (IVW) Results", "MR-Egger Results",
"Weighted Median Results", "MR-Lasso Results", "MR-ConMix Results",
"Wald Ratio Tests for Each Instrument", "Heterogeneity Test Results",
"Pleiotropy Test Results", "Leave-One-Out Analysis Results",
"Filtered Instruments Data", "Clumped SNPs Data")
)
toc_data <- toc_data[complete.cases(toc_data), ] # Remove any NA rows (e.g., if lasso/conmix are NULL)
addWorksheet(wb, "TOC")
writeData(wb, "TOC", "Table of Contents", startRow = 1, startCol = 1)
mergeCells(wb, "TOC", cols = 1:2, rows = 1)
addStyle(wb, "TOC", title_style, rows = 1, cols = 1)
writeData(wb, "TOC", toc_data, startRow = 3, startCol = 1, headerStyle = header_style)
setColWidths(wb, "TOC", cols = 1:2, widths = "auto")
# Add all sheets
add_formatted_sheet(wb, "Estimates", estimates, "MR Causal Estimates for N-acetyltaurine on BMI (mr_wrapper)")
add_formatted_sheet(wb, "Heterogeneity", heterogeneity, "Heterogeneity Results (mr_wrapper)")
add_formatted_sheet(wb, "Directional_Pleiotropy", directional_pleiotropy, "Directional Pleiotropy Results (Egger Intercept, mr_wrapper)")
add_formatted_sheet(wb, "Info", info, "Summary Information and Diagnostics (mr_wrapper)")
add_formatted_sheet(wb, "SNPs_Retained", snps_retained, "SNPs Retained After Filtering (mr_wrapper)")
add_formatted_sheet(wb, "IVW", ivw_result, "Inverse Variance Weighted (IVW) Results")
add_formatted_sheet(wb, "MR-Egger", egger_result, "MR-Egger Results")
add_formatted_sheet(wb, "Weighted_Median", weighted_median_result, "Weighted Median Results")
if (!is.null(lasso_result)) {
lasso_df <- data.frame(
Exposure = lasso_result@Exposure,
Outcome = lasso_result@Outcome,
Estimate = lasso_result@Estimate,
StdError = lasso_result@StdError,
CILower = lasso_result@CILower,
CIUpper = lasso_result@CIUpper,
Pvalue = lasso_result@Pvalue,
SNPs = lasso_result@SNPs,
Valid = lasso_result@Valid,
ValidSNPs = if (length(lasso_result@ValidSNPs) > 0) paste(lasso_result@ValidSNPs, collapse = ", ") else "None",
RegEstimate = lasso_result@RegEstimate,
RegIntercept = paste(lasso_result@RegIntercept, collapse = ", "),
Lambda = lasso_result@Lambda
)
add_formatted_sheet(wb, "MR-Lasso", lasso_df, "MR-Lasso Results")
} else {
addWorksheet(wb, "MR-Lasso")
writeData(wb, "MR-Lasso", "MR-Lasso Analysis Failed", startRow = 1, startCol = 1)
addStyle(wb, "MR-Lasso", title_style, rows = 1, cols = 1)
}
if (!is.null(conmix_result)) {
conmix_df <- data.frame(
Exposure = conmix_result@Exposure,
Outcome = conmix_result@Outcome,
Estimate = conmix_result@Estimate,
Pvalue = conmix_result@Pvalue,
SNPs = conmix_result@SNPs,
Psi = conmix_result@Psi,
CILower = conmix_result@CILower,
CIUpper = conmix_result@CIUpper,
CIRange = paste(conmix_result@CIRange, collapse = ", "),
CIMin = conmix_result@CIMin,
CIMax = conmix_result@CIMax,
CIStep = conmix_result@CIStep,
Valid = paste(conmix_result@Valid, collapse = ", "),
ValidSNPs = paste(conmix_result@ValidSNPs, collapse = ", "),
Alpha = conmix_result@Alpha
)
add_formatted_sheet(wb, "MR-ConMix", conmix_df, "MR-ConMix Results")
} else {
addWorksheet(wb, "MR-ConMix")
writeData(wb, "MR-ConMix", "MR-ConMix Analysis Failed", startRow = 1, startCol = 1)
addStyle(wb, "MR-ConMix", title_style, rows = 1, cols = 1)
}
add_formatted_sheet(wb, "Wald_Ratios", wald_ratios, "Wald Ratio Tests for Each Instrument")
add_formatted_sheet(wb, "Heterogeneity_Test", heterogeneity_result, "Heterogeneity Test Results")
add_formatted_sheet(wb, "Pleiotropy_Test", pleiotropy_result, "Pleiotropy Test Results")
add_formatted_sheet(wb, "Leave_One_Out", loo_result, "Leave-One-Out Analysis Results")
add_formatted_sheet(wb, "Instruments", instruments_subset, "Filtered Instruments Data")
add_formatted_sheet(wb, "Clumped_Data", clumped_dat, "Clumped SNPs Data")
saveWorkbook(wb, excel_file, overwrite = TRUE)
cat("Excel file saved to:", excel_file, "\n")
# 7.2: Save Forest Plot
ggsave(plot_file, forest_plot, width = 12, height = 10, dpi = 600, bg = "white")
cat("Forest plot saved to:", plot_file, "\n")