cd /Volumes/LaCie/tahir/mr_ukbppp_bp
wget https://gwas.mrcieu.ac.uk/files/ieu-b-38/ieu-b-38.vcf.gz
run with:
cd /Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/scripts
chmod +x format_ieu-b-40.vcf.sh
nohup ./format_ieu-b-40.vcf.sh > format_ieu-b-40.vcf.log 2>&1 &
ps -p 96562 -o stat,etime   # 96562 was the PID of this run; use $! right after nohup
# Step 1: Decompress ieu-b-40.vcf.gz without deleting the original
cd /Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/data
gunzip -k ieu-b-40.vcf.gz
# Step 2: Convert the decompressed .vcf file to a .tsv format and calculate p-values from logp
outfile="ieu-b-40_formatted.tsv"
# Write header
echo "CHR\tBP\tSNP\tREF\tALT\tEAF\tBETA\tSE\tLP\tPVAL" > "$outfile"
# Extract fields and calculate pval
awk 'BEGIN{OFS="\t"}
!/^#/ {
split($10, a, ":");
beta = a[1];
se = a[2];
lp = a[3];
eaf = a[4];
pval = 10^(-lp);
print $1, $2, $3, $4, $5, eaf, beta, se, lp, pval
}' ieu-b-40.vcf >> "$outfile"
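As a quick sanity check (a sketch, assuming data.table is installed and the formatted file is in the working directory), PVAL should reproduce 10^(-LP) up to awk's default ~6-significant-digit output precision:
library(data.table)
dat <- fread("ieu-b-40_formatted.tsv", nrows = 10000)
# Relative agreement between stored PVAL and 10^(-LP); tolerance is loose
# because both columns were printed with limited precision
rel_err <- abs(dat$PVAL - 10^(-dat$LP)) / pmax(dat$PVAL, .Machine$double.xmin)
stopifnot(all(rel_err < 1e-2, na.rm = TRUE))
summary(dat$PVAL)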
Source (EBI GWAS Catalog FTP): /pub/databases/gwas/summary_statistics/GCST90271001-GCST90272000/GCST90271557/harmonised/
#!/bin/bash
# Step 1: Navigate to the directory containing the file
cd /Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/data
# Step 2: Define the input and output file names
input_file="GCST90271557.h.tsv.gz"
output_file="GCST90271557_formatted_with_originals.tsv"
# Step 3: Use gzcat instead of zcat (compatible with macOS)
gzcat "$input_file" | head -n 1 | awk '{print $0"\tCHR\tBP\tSNP\tREF\tALT\tEAF\tBETA\tSE\tLP\tPVAL"}' > "$output_file"
# Step 4: Process the file, appending formatted columns without altering original data
gzcat "$input_file" | awk 'NR > 1 {
# Preserve original columns
chr = $1;
bp = $2;
ref = $4;
alt = $3;
eaf = $7;
beta = $5;
se = $6;
pval = $8;
rsid = ($10 != "" ? $10 : $9);
# Calculate LP (-log10 p-value) only for valid, nonzero p-values (regex also accepts scientific notation)
lp = (pval ~ /^[0-9.]+([eE][+-]?[0-9]+)?$/ && pval + 0 > 0 ? -log(pval) / log(10) : "NA");
# Print all original columns plus the formatted columns
print $0, chr, bp, rsid, ref, alt, eaf, beta, se, lp, pval
}' OFS='\t' >> "$output_file"
echo "Reformatted file with original columns saved to: $output_file"
run with:
cd /Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/scripts
chmod +x format_ukb-b-19393.sh
nohup ./format_ukb-b-19393.sh > format_ukb-b-19393.log 2>&1 &
ps -p 96562 -o stat,etime   # 96562 was the PID of this run; use $! right after nohup
# Step 1: Decompress ukb-b-19393.vcf.gz without deleting the original
cd /Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/data
gunzip -k ukb-b-19393.vcf.gz
# Step 2: Convert the decompressed .vcf file to a .tsv format and calculate p-values from logp
outfile="ukb-b-19393_formatted.tsv"
# Write header
echo "CHR\tBP\tSNP\tREF\tALT\tEAF\tBETA\tSE\tLP\tPVAL" > "$outfile"
# Extract fields and calculate pval
awk 'BEGIN{OFS="\t"}
!/^#/ {
split($10, a, ":");
beta = a[1];
se = a[2];
lp = a[3];
eaf = a[4];
pval = 10^(-lp);
print $1, $2, $3, $4, $5, eaf, beta, se, lp, pval
}' ukb-b-19393.vcf >> "$outfile"
run with:
cd /Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/scripts
chmod +x format_ieu-a-302.sh
nohup ./format_ieu-a-302.sh > format_ieu-a-302.log 2>&1 &
ps -p 96562 -o stat,etime   # 96562 was the PID of this run; use $! right after nohup
# Step 1: Decompress ieu-a-302.vcf.gz without deleting the original
cd /Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/data
gunzip -k ieu-a-302.vcf.gz
# Step 2: Convert the decompressed .vcf file to a .tsv format and calculate p-values from logp
outfile="ieu-a-302_formatted.tsv"
# Write header
echo "CHR\tBP\tSNP\tREF\tALT\tEAF\tBETA\tSE\tLP\tPVAL" > "$outfile"
# Extract fields and calculate pval
awk 'BEGIN{OFS="\t"}
!/^#/ {
split($10, a, ":");
beta = a[1];
se = a[2];
lp = a[3];
eaf = a[4];
pval = 10^(-lp);
print $1, $2, $3, $4, $5, eaf, beta, se, lp, pval
}' ieu-a-302.vcf >> "$outfile"
run with:
nano format_DM_bmi_adjusted_unadjusted.sh
./format_DM_bmi_adjusted_unadjusted.sh
#!/bin/bash
# Set the working directory
cd /Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/data
# Define the input files
input_files=("T2D_European.BMIunadjusted.txt" "T2D_European.BMIadjusted.txt")
for input_file in "${input_files[@]}"; do
# Generate output file name
output_file="${input_file%.txt}_formatted.tsv"
# Write the header with your specified columns
echo -e "CHR\tBP\tSNP\tREF\tALT\tEAF\tBETA\tSE\tLP\tPVAL\tN\tCASES\tCONTROLS\tOUTCOME" > "$output_file"
# Process the file, creating new columns only if they don't exist
awk 'NR > 1 {
chr = $3;
bp = $4;
snp = ($1 == "." ? $2 : $1);
ref = $6;
alt = $5;
eaf = $7;
beta = $9;
se = $10;
pval = $11;
lp = (pval > 0 ? -log(pval)/log(10) : "NA");
# Default values for missing columns
n = (NF >= 12 ? $12 : "NA");
cases = (NF >= 13 ? $13 : "NA");
controls = (NF >= 14 ? $14 : "NA");
outcome = (NF >= 15 ? $15 : (FILENAME ~ /BMIadjusted/ ? "T2D_BMIadjusted" : "T2D_BMIunadjusted"));
print chr, bp, snp, ref, alt, eaf, beta, se, lp, pval, n, cases, controls, outcome
}' OFS="\t" "$input_file" >> "$output_file"
echo "Formatted file saved to: $output_file"
done
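A small follow-up check (a sketch, assuming data.table is installed) that both formatted T2D files carry the intended OUTCOME label:
library(data.table)
for (f in c("T2D_European.BMIunadjusted_formatted.tsv",
            "T2D_European.BMIadjusted_formatted.tsv")) {
  dat <- fread(f, select = "OUTCOME")
  cat(f, "-", nrow(dat), "rows; OUTCOME:", paste(unique(dat$OUTCOME), collapse = ", "), "\n")
}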
run with:
cd /Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/scripts
chmod +x format_ieu-a-7.sh
nohup ./format_ieu-a-7.sh > format_ieu-a-7.log 2>&1 &
ps -p 96562 -o stat,etime   # 96562 was the PID of this run; use $! right after nohup
# Step 1: Decompress ieu-a-7.vcf.gz without deleting the original
cd /Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/data
gunzip -k ieu-a-7.vcf.gz
# Step 2: Convert the decompressed .vcf file to a .tsv format and calculate p-values from logp
outfile="ieu-a-7_formatted.tsv"
# Write header
echo "CHR\tBP\tSNP\tREF\tALT\tEAF\tBETA\tSE\tLP\tPVAL" > "$outfile"
# Extract fields and calculate pval
awk 'BEGIN{OFS="\t"}
!/^#/ {
split($10, a, ":");
beta = a[1];
se = a[2];
lp = a[3];
eaf = a[4];
pval = 10^(-lp);
print $1, $2, $3, $4, $5, eaf, beta, se, lp, pval
}' ieu-a-7.vcf >> "$outfile"
Rscript ./harmonization_500KB_tss.R
#!/usr/bin/env Rscript
# /Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/
# Rscript 2025_05_18_harmonization_mr_aric_500kb_8_outcomes.R
# Clear workspace
rm(list = ls())
# Load libraries
library(TwoSampleMR)
library(dplyr)
library(data.table)
library(foreach)
library(doParallel)
library(RhpcBLASctl)
# Define %||% operator for fallback
`%||%` <- function(x, y) if (length(x) > 0) x else y
# Raise the memory ceiling (128 GB). Note: R_MAX_VSIZE is read when R starts,
# so to take effect reliably it should be set in ~/.Renviron before launching.
Sys.setenv("R_MAX_VSIZE" = "128Gb")
cat("Max vector size set to:", Sys.getenv("R_MAX_VSIZE"), "\n")
# Optimize BLAS for single-threaded performance
blas_set_num_threads(1)
# Set directories
base_dir <- "/Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic"
pqtl_dir <- file.path(base_dir, "pqtls/EA")
outcome_dir <- file.path(base_dir, "data/formatted")
results_dir <- file.path(base_dir, "harmonization/aric_harmonized_500_tss")
mapping_file <- file.path(base_dir, "pqtls/seqid.txt")
# Create results directory
if (!dir.exists(results_dir)) {
dir.create(results_dir, recursive = TRUE)
}
# Verify directories and mapping file
if (!dir.exists(pqtl_dir)) {
stop(sprintf("pQTL directory %s does not exist.", pqtl_dir))
}
if (!dir.exists(outcome_dir)) {
stop(sprintf("Outcome directory %s does not exist.", outcome_dir))
}
if (!file.exists(mapping_file)) {
stop(sprintf("Mapping file %s does not exist.", mapping_file))
}
# Load mapping file
mapping_data <- fread(mapping_file, data.table = FALSE)
cat(sprintf("Loaded mapping file with %d seqids\n", nrow(mapping_data)))
# List pQTL and outcome files
pqtl_files <- list.files(pqtl_dir, pattern = "^SeqId_.*\\.PHENO1\\.glm\\.linear$", full.names = TRUE)
outcome_files <- list.files(outcome_dir, pattern = "_formatted.*\\.tsv$", full.names = TRUE)
cat("Total pQTL files to process:", length(pqtl_files), "\n")
cat("Total outcome files to process:", length(outcome_files), "\n")
# Create all pQTL-outcome pairs for parallel processing
task_pairs <- expand.grid(pqtl_file = pqtl_files, outcome_file = outcome_files, stringsAsFactors = FALSE)
cat("Total tasks (pQTL-outcome pairs):", nrow(task_pairs), "\n")
# Function to harmonize pQTL with an outcome
harmonize_pqtl <- function(pqtl_file, outcome_file, outcome_data, results_dir, mapping_data) {
# Extract seqid from pQTL file name
seqid <- sub("\\.PHENO1\\.glm\\.linear$", "", basename(pqtl_file))
# Get chromosome from mapping data
protein_data <- mapping_data %>% filter(seqid_in_sample == seqid)
if (nrow(protein_data) == 0) {
cat(sprintf("No mapping data for %s; skipping\n", seqid))
return(NULL)
}
protein_chrom <- as.character(protein_data$chromosome_name[1])
# Read and format exposure data (pQTL)
exp_dat <- fread(pqtl_file) %>%
filter(!is.na(ID), grepl("^rs", ID)) %>%
dplyr::select(
SNP = ID,
chr.exposure = `#CHROM`,
pos.exposure = POS,
effect_allele.exposure = A1,
other_allele.exposure = REF,
eaf.exposure = A1_FREQ,
beta.exposure = BETA,
se.exposure = SE,
pval.exposure = P,
samplesize.exposure = OBS_CT
) %>%
mutate(
exposure = seqid,
id.exposure = seqid,
chr.exposure = as.character(chr.exposure)
)
# Extract outcome name from pre-loaded outcome data
outcome_name <- unique(outcome_data$OUTCOME)
if (length(outcome_name) != 1) {
cat(sprintf("Multiple or no OUTCOME values in %s; skipping\n", basename(outcome_file)))
return(NULL)
}
# Format outcome data
out_dat <- outcome_data %>%
filter(CHR == protein_chrom, !is.na(SNP), grepl("^rs", SNP)) %>%
dplyr::select(
SNP = SNP,
chr.outcome = CHR,
pos.outcome = BP,
effect_allele.outcome = ALT,
other_allele.outcome = REF,
eaf.outcome = EAF,
beta.outcome = BETA,
se.outcome = SE,
pval.outcome = PVAL,
samplesize.outcome = N
) %>%
mutate(
outcome = outcome_name,
id.outcome = outcome_name,
chr.outcome = as.character(chr.outcome)
)
# Harmonize data
dat <- tryCatch({
harmonise_data(exposure_dat = exp_dat, outcome_dat = out_dat, action = 1)
}, error = function(e) {
cat(sprintf("Harmonization failed for %s with %s: %s\n", seqid, outcome_name, e$message))
return(NULL)
})
if (is.null(dat) || nrow(dat) == 0) {
cat(sprintf("No harmonized data for %s with %s; skipping\n", seqid, outcome_name))
return(NULL)
}
# Clean outcome_name for file naming: replace spaces, parentheses, commas, hyphens, and other special characters with underscores
outcome_name_file <- gsub("[^[:alnum:]]", "_", outcome_name)
# Remove multiple consecutive underscores
outcome_name_file <- gsub("_+", "_", outcome_name_file)
# Remove leading or trailing underscores
outcome_name_file <- gsub("^_|_$", "", outcome_name_file)
# Save harmonized data using fwrite for speed
harmonized_file <- file.path(results_dir, sprintf("tss_harmonized_%s_%s.csv", seqid, outcome_name_file))
fwrite(dat, harmonized_file, row.names = FALSE)
cat(sprintf("Harmonized data saved for %s with %s: %s\n", seqid, outcome_name, harmonized_file))
return(NULL)
}
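# Optional smoke test (a sketch): harmonize a single pQTL-outcome pair
# interactively before launching the full parallel loop. Guarded with
# interactive() so it does not fire when run via Rscript.
if (interactive() && length(pqtl_files) > 0 && length(outcome_files) > 0) {
  test_outcome <- fread(outcome_files[1], nThread = 1)
  harmonize_pqtl(pqtl_files[1], outcome_files[1], test_outcome, results_dir, mapping_data)
}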
# Parallel setup
num_cores <- min(detectCores() - 1, 16)
registerDoParallel(cores = num_cores)
cat(sprintf("Using %d cores for parallel processing\n", num_cores))
# Pre-load outcome data to avoid repeated reads
outcome_data_list <- lapply(outcome_files, function(file) {
cat(sprintf("Pre-loading outcome file: %s\n", basename(file)))
fread(file, nThread = 1)
})
names(outcome_data_list) <- outcome_files
# Process all pQTL-outcome pairs in parallel
foreach(i = 1:nrow(task_pairs), .packages = c("dplyr", "data.table", "TwoSampleMR")) %dopar% {
pqtl_file <- task_pairs$pqtl_file[i]
outcome_file <- task_pairs$outcome_file[i]
outcome_data <- outcome_data_list[[outcome_file]]
harmonize_pqtl(pqtl_file, outcome_file, outcome_data, results_dir, mapping_data)
}
# Clean up
stopImplicitCluster()
cat("Harmonization process completed for all pQTLs and outcomes.\n")
# Had to redo for the T2D files:
#!/usr/bin/env Rscript
# /Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/
# Rscript 2025_05_18_harmonization_mr_aric_500kb_T2D_outcomes.R
# Clear workspace
rm(list = ls())
# Load libraries
library(TwoSampleMR)
library(dplyr)
library(data.table)
library(foreach)
library(doParallel)
library(RhpcBLASctl)
# Define %||% operator for fallback
`%||%` <- function(x, y) if (length(x) > 0) x else y
# Raise the memory ceiling (128 GB). Note: R_MAX_VSIZE is read when R starts,
# so to take effect reliably it should be set in ~/.Renviron before launching.
Sys.setenv("R_MAX_VSIZE" = "128Gb")
cat("Max vector size set to:", Sys.getenv("R_MAX_VSIZE"), "\n")
# Optimize BLAS for single-threaded performance
blas_set_num_threads(1)
# Set directories
base_dir <- "/Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic"
pqtl_dir <- file.path(base_dir, "pqtls/EA")
outcome_dir <- file.path(base_dir, "data/formatted")
results_dir <- file.path(base_dir, "harmonization/aric_harmonized_500kb")
mapping_file <- file.path(base_dir, "pqtls/seqid.txt")
# Log the results directory
cat("Harmonized results will be saved in:", results_dir, "\n")
# Create results directory
if (!dir.exists(results_dir)) {
dir.create(results_dir, recursive = TRUE)
}
# Verify directories and mapping file
if (!dir.exists(pqtl_dir)) {
stop(sprintf("pQTL directory %s does not exist.", pqtl_dir))
}
if (!dir.exists(outcome_dir)) {
stop(sprintf("Outcome directory %s does not exist.", outcome_dir))
}
if (!file.exists(mapping_file)) {
stop(sprintf("Mapping file %s does not exist.", mapping_file))
}
# Load mapping file
mapping_data <- fread(mapping_file, data.table = FALSE)
cat(sprintf("Loaded mapping file with %d seqids\n", nrow(mapping_data)))
# List pQTL and outcome files (only T2D outcomes)
pqtl_files <- list.files(pqtl_dir, pattern = "^SeqId_.*\\.PHENO1\\.glm\\.linear$", full.names = TRUE)
outcome_files <- list.files(outcome_dir, pattern = "T2D_European\\.(BMIadjusted|BMIunadjusted)\\.tsv$", full.names = TRUE)
cat("Total pQTL files to process:", length(pqtl_files), "\n")
cat("Total outcome files to process:", length(outcome_files), "\n")
cat("Outcome files:", paste(basename(outcome_files), collapse = ", "), "\n")
# Create all pQTL-outcome pairs for parallel processing
task_pairs <- expand.grid(pqtl_file = pqtl_files, outcome_file = outcome_files, stringsAsFactors = FALSE)
cat("Total tasks (pQTL-outcome pairs):", nrow(task_pairs), "\n")
# Function to harmonize pQTL with an outcome
harmonize_pqtl <- function(pqtl_file, outcome_file, outcome_data, results_dir, mapping_data) {
# Extract seqid from pQTL file name
seqid <- sub("\\.PHENO1\\.glm\\.linear$", "", basename(pqtl_file))
# Get chromosome from mapping data
protein_data <- mapping_data %>% filter(seqid_in_sample == seqid)
if (nrow(protein_data) == 0) {
cat(sprintf("No mapping data for %s; skipping\n", seqid))
return(NULL)
}
protein_chrom <- as.character(protein_data$chromosome_name[1])
# Read and format exposure data (pQTL)
exp_dat <- fread(pqtl_file) %>%
filter(!is.na(ID), grepl("^rs", ID)) %>%
dplyr::select(
SNP = ID,
chr.exposure = `#CHROM`,
pos.exposure = POS,
effect_allele.exposure = A1,
other_allele.exposure = REF,
eaf.exposure = A1_FREQ,
beta.exposure = BETA,
se.exposure = SE,
pval.exposure = P,
samplesize.exposure = OBS_CT
) %>%
mutate(
exposure = seqid,
id.exposure = seqid,
chr.exposure = as.character(chr.exposure)
)
cat(sprintf("Exposure data for %s: %d rows\n", seqid, nrow(exp_dat)))
# Extract outcome name from pre-loaded outcome data
outcome_name <- unique(outcome_data$OUTCOME)
cat(sprintf("Outcome file %s has %d unique OUTCOME values: %s\n",
basename(outcome_file), length(outcome_name), paste(outcome_name, collapse = ", ")))
if (length(outcome_name) != 1) {
cat(sprintf("Multiple or no OUTCOME values in %s; skipping\n", basename(outcome_file)))
return(NULL)
}
# Format outcome data
out_dat <- outcome_data %>%
filter(CHR == protein_chrom, !is.na(SNP), grepl("^rs", SNP)) %>%
dplyr::select(
SNP = SNP,
chr.outcome = CHR,
pos.outcome = BP,
effect_allele.outcome = ALT,
other_allele.outcome = REF,
eaf.outcome = EAF,
beta.outcome = BETA,
se.outcome = SE,
pval.outcome = PVAL,
samplesize.outcome = N
) %>%
mutate(
outcome = outcome_name,
id.outcome = outcome_name,
chr.outcome = as.character(chr.outcome)
)
cat(sprintf("Outcome data for %s with %s (chr %s): %d rows\n",
outcome_name, seqid, protein_chrom, nrow(out_dat)))
# Harmonize data
dat <- tryCatch({
harmonise_data(exposure_dat = exp_dat, outcome_dat = out_dat, action = 1)
}, error = function(e) {
cat(sprintf("Harmonization failed for %s with %s: %s\n", seqid, outcome_name, e$message))
return(NULL)
})
if (is.null(dat) || nrow(dat) == 0) {
cat(sprintf("No harmonized data for %s with %s; skipping\n", seqid, outcome_name))
return(NULL)
}
cat(sprintf("Harmonized data for %s with %s: %d rows\n", seqid, outcome_name, nrow(dat)))
# Clean outcome_name for file naming: replace non-alphanumeric with underscores, clean up
outcome_name_file <- gsub("[^[:alnum:]]", "_", outcome_name)
outcome_name_file <- gsub("_+", "_", outcome_name_file)
outcome_name_file <- gsub("^_|_$", "", outcome_name_file)
# Save harmonized data using fwrite for speed
harmonized_file <- file.path(results_dir, sprintf("tss_harmonized_%s_%s.csv", seqid, outcome_name_file))
fwrite(dat, harmonized_file, row.names = FALSE)
cat(sprintf("Harmonized data saved for %s with %s: %s\n", seqid, outcome_name, harmonized_file))
return(NULL)
}
# Parallel setup
num_cores <- min(detectCores() - 1, 16)
registerDoParallel(cores = num_cores)
cat(sprintf("Using %d cores for parallel processing\n", num_cores))
# Pre-load outcome data to avoid repeated reads
outcome_data_list <- lapply(outcome_files, function(file) {
cat(sprintf("Pre-loading outcome file: %s\n", basename(file)))
fread(file, nThread = 1)
})
names(outcome_data_list) <- outcome_files
# Process all pQTL-outcome pairs in parallel
foreach(i = 1:nrow(task_pairs), .packages = c("dplyr", "data.table", "TwoSampleMR")) %dopar% {
pqtl_file <- task_pairs$pqtl_file[i]
outcome_file <- task_pairs$outcome_file[i]
outcome_data <- outcome_data_list[[outcome_file]]
harmonize_pqtl(pqtl_file, outcome_file, outcome_data, results_dir, mapping_data)
}
# Clean up
stopImplicitCluster()
cat("Harmonization process completed for all pQTLs and T2D outcomes.\n")
Rscript 2025_05_18_parallel_clump_mr_aric_500kb_8_outcomes.R
#!/usr/bin/env Rscript
# /Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/scripts
# Rscript 2025_05_18_parallel_clump_mr_aric_500kb_8_outcomes.R
# Clear workspace
rm(list = ls())
# Load libraries
library(TwoSampleMR)
library(MendelianRandomization)
library(ieugwasr)
library(MRInstruments)
library(dplyr)
library(tidyr)
library(data.table)
library(openxlsx)
library(ggplot2)
library(scales)
library(RhpcBLASctl)
library(patchwork)
library(genetics.binaRies)
library(foreach)
library(doParallel)
# Define %||% operator
`%||%` <- function(x, y) if (length(x) > 0) x else y
# Raise the memory ceiling (128 GB). Note: R_MAX_VSIZE is read when R starts,
# so to take effect reliably it should be set in ~/.Renviron before launching.
Sys.setenv("R_MAX_VSIZE" = "128Gb")
cat("Max vector size set to:", Sys.getenv("R_MAX_VSIZE"), "\n")
# Optimize BLAS
blas_set_num_threads(1)
# Set directories
base_dir <- "/Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic"
harmonized_dir <- file.path(base_dir, "harmonization/aric_harmonized_500_tss")
mr_results_dir <- file.path(base_dir, "final_mr_res_aric_500kb")
if (!dir.exists(harmonized_dir)) stop("Harmonized directory does not exist:", harmonized_dir)
if (!dir.exists(mr_results_dir)) dir.create(mr_results_dir, recursive = TRUE)
# LD clumping thresholds
clump_r2_values <- c(0.001, 0.01, 0.1)
clump_labels <- c("r2_001", "r2_01", "r2_1")
# Define binary outcomes
binary_outcomes <- c("Coronary_Heart_Disease", "T2D_BMIadjusted", "T2D_BMIunadjusted")
# Set up logging
log_file <- file.path(base_dir, "mr_analysis_aric_500KB_tss.log")
sink(log_file, append = TRUE, split = TRUE)
cat(sprintf("Starting MR analysis on %s\n", Sys.time()))
# List harmonized files
harmonized_files <- list.files(harmonized_dir, pattern = "^tss_harmonized_SeqId_.*\\.csv$", full.names = TRUE)
cat(sprintf("Total harmonized files found: %d\n", length(harmonized_files)))
if (length(harmonized_files) == 0) {
cat("No harmonized files found in", harmonized_dir, "\n")
sink()
stop("No harmonized files to process")
}
cat("Harmonized file examples:", paste(basename(head(harmonized_files, 5)), collapse = ", "), "\n")
# Create task list
task_pairs <- expand.grid(
harmonized_file = harmonized_files,
clump_r2_index = seq_along(clump_r2_values),
stringsAsFactors = FALSE
)
cat("Total tasks (pQTL-outcome-r2 pairs):", nrow(task_pairs), "\n")
# Function to perform MR analysis
perform_mr <- function(harmonized_data, seqid, outcome_name, clump_r2, clump_label, is_binary) {
today <- Sys.Date()
output_dir <- file.path(mr_results_dir, clump_label)
if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)
excel_file <- file.path(output_dir, sprintf("MR_Results_%s_%s_%s_500KB_tss_%s.xlsx", seqid, outcome_name, clump_label, today))
# Skip if Excel file exists
if (file.exists(excel_file)) {
cat(sprintf("Skipping %s with %s (r2 = %s): Existing Excel file found\n", seqid, outcome_name, clump_r2))
return(NULL)
}
# Use pre-loaded harmonized data
dat <- harmonized_data
cat(sprintf("Processing %s with %s (r2 = %s): %d rows\n", seqid, outcome_name, clump_r2, nrow(dat)))
if (nrow(dat) == 0) {
cat(sprintf("No data for %s with %s; skipping\n", seqid, outcome_name))
return(NULL)
}
# Filter instruments
instruments <- dat %>%
filter(mr_keep == TRUE, pval.exposure < 5e-6) %>%
mutate(
rsid = SNP,
pval = pval.exposure,
id = sprintf("%s exposure", seqid),
F_stat = (beta.exposure / se.exposure)^2
) %>%
filter(F_stat > 10)
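# F_stat is the square of the per-SNP Wald z-score: e.g. beta = 0.08 and
# se = 0.02 give F = (0.08/0.02)^2 = 16, which clears the > 10 cutoff.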
cat(sprintf("Instruments for %s with %s: %d\n", seqid, outcome_name, nrow(instruments)))
if (nrow(instruments) == 0) {
cat(sprintf("No valid instruments for %s with %s; skipping\n", seqid, outcome_name))
return(NULL)
}
# Steiger filtering
instruments <- tryCatch({
steiger_filtering(instruments)
}, error = function(e) {
cat(sprintf("Steiger filtering failed for %s with %s: %s\n", seqid, outcome_name, e$message))
return(NULL)
})
if (is.null(instruments) || !"steiger_dir" %in% colnames(instruments) || nrow(instruments) == 0) {
cat(sprintf("Steiger filtering failed or no instruments for %s with %s; skipping\n", seqid, outcome_name))
return(NULL)
}
instruments <- instruments %>% filter(steiger_dir == TRUE)
cat(sprintf("Instruments after Steiger filtering: %d\n", nrow(instruments)))
if (nrow(instruments) == 0) return(NULL)
# LD clumping
clumped_dat <- NULL
tryCatch({
clumped <- ld_clump(
dplyr::tibble(rsid = instruments$rsid, pval = instruments$pval, id = instruments$id),
plink_bin = genetics.binaRies::get_plink_binary(),
bfile = "/Users/charleenadams/1000G_bfiles/EUR/EUR", # REPLACE WITH YOUR CORRECT bfile PATH
clump_r2 = clump_r2,
clump_kb = 500
)
clumped_dat <- instruments %>% filter(SNP %in% clumped$rsid)
cat(sprintf("SNPs after clumping (r2 = %s): %d\n", clump_r2, nrow(clumped_dat)))
}, error = function(e) {
cat(sprintf("LD clumping failed for %s with %s (r2 = %s): %s\n", seqid, outcome_name, clump_r2, e$message))
return(NULL)
})
if (is.null(clumped_dat) || nrow(clumped_dat) == 0) {
cat(sprintf("No SNPs after clumping for %s with %s (r2 = %s); skipping\n", seqid, outcome_name, clump_r2))
return(NULL)
}
# Perform MR analyses
ivw_result <- tryCatch(mr(clumped_dat, method_list = "mr_ivw"), error = function(e) {
cat(sprintf("IVW failed for %s with %s: %s\n", seqid, outcome_name, e$message))
NULL
})
egger_result <- if (nrow(clumped_dat) >= 3) {
tryCatch(mr(clumped_dat, method_list = "mr_egger_regression"), error = function(e) {
cat(sprintf("MR-Egger failed for %s with %s: %s\n", seqid, outcome_name, e$message))
NULL
})
} else NULL
weighted_median_result <- tryCatch(mr(clumped_dat, method_list = "mr_weighted_median"), error = function(e) {
cat(sprintf("Weighted Median failed for %s with %s: %s\n", seqid, outcome_name, e$message))
NULL
})
mr_input <- tryCatch({
mr_input(
bx = clumped_dat$beta.exposure,
bxse = clumped_dat$se.exposure,
by = clumped_dat$beta.outcome,
byse = clumped_dat$se.outcome,
exposure = seqid,
outcome = outcome_name,
snps = clumped_dat$SNP
)
}, error = function(e) {
cat(sprintf("MR input failed for %s with %s: %s\n", seqid, outcome_name, e$message))
return(NULL)
})
if (is.null(mr_input)) return(NULL)
lasso_result <- tryCatch(mr_lasso(mr_input), error = function(e) {
cat(sprintf("MR-Lasso failed for %s with %s: %s\n", seqid, outcome_name, e$message))
NULL
})
conmix_result <- tryCatch(mr_conmix(mr_input), error = function(e) {
cat(sprintf("MR-ConMix failed for %s with %s: %s\n", seqid, outcome_name, e$message))
NULL
})
heterogeneity_result <- tryCatch(mr_heterogeneity(clumped_dat), error = function(e) {
cat(sprintf("Heterogeneity test failed for %s with %s: %s\n", seqid, outcome_name, e$message))
NULL
})
pleiotropy_result <- if (nrow(clumped_dat) >= 3) tryCatch(mr_pleiotropy_test(clumped_dat), error = function(e) {
cat(sprintf("Pleiotropy test failed for %s with %s: %s\n", seqid, outcome_name, e$message))
NULL
}) else NULL
loo_result <- tryCatch(mr_leaveoneout(clumped_dat), error = function(e) {
cat(sprintf("Leave-one-out failed for %s with %s: %s\n", seqid, outcome_name, e$message))
NULL
})
# Calculate Wald ratios with id.exposure and id.outcome
wald_ratios <- clumped_dat %>%
mutate(
wald_beta = beta.outcome / beta.exposure,
wald_se = sqrt((se.outcome^2 / beta.exposure^2) + ((beta.outcome^2 * se.exposure^2) / (beta.exposure^4))),
pval = 2 * pnorm(abs(wald_beta / wald_se), lower.tail = FALSE),
method = paste("Wald Ratio:", SNP),
id.exposure = seqid,
id.outcome = outcome_name
) %>%
dplyr::select(id.exposure, id.outcome, SNP, wald_beta, wald_se, pval, method)
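# The wald_se above is the second-order delta-method SE for the ratio
# beta.outcome / beta.exposure. Illustration on one synthetic SNP:
#   bx <- 0.10; sx <- 0.01; by <- 0.05; sy <- 0.02
#   sy / abs(bx)                            # first-order:  0.2
#   sqrt(sy^2/bx^2 + by^2*sx^2/bx^4)        # second-order: ~0.206
# The second-order SE is never smaller than the first-order one.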
# # Process MR results for forest plot
# mr_results <- bind_rows(
# if (!is.null(ivw_result) && nrow(ivw_result) > 0) ivw_result[, c("method", "b", "se", "pval")],
# if (!is.null(egger_result) && nrow(egger_result) > 0) egger_result[, c("method", "b", "se", "pval")],
# if (!is.null(weighted_median_result) && nrow(weighted_median_result) > 0) weighted_median_result[, c("method", "b", "se", "pval")],
# if (!is.null(lasso_result)) data.frame(
# method = "MR-Lasso",
# b = lasso_result@Estimate,
# se = lasso_result@StdError,
# pval = lasso_result@Pvalue
# ),
# if (!is.null(conmix_result)) data.frame(
# method = "MR-ConMix",
# b = conmix_result@Estimate,
# se = (conmix_result@CIUpper - conmix_result@CILower)/(2*1.96),
# pval = conmix_result@Pvalue
# ),
# wald_ratios %>% dplyr::select(method, b = wald_beta, se = wald_se, pval)
# ) %>%
# mutate(
# method = factor(method, levels = unique(method)),
# estimate = b,
# estimate_lower = b - 1.96 * se,
# estimate_upper = b + 1.96 * se
# ) %>%
# na.omit()
#
# if (nrow(mr_results) == 0) {
# cat(sprintf("No MR results for %s with %s (r2 = %s); skipping\n", seqid, outcome_name, clump_r2))
# return(NULL)
# }
#
# # Define colors for forest plot
# base_colors <- c("IVW" = "#1F78B4", "MR-Egger" = "#FF7F00", "Weighted Median" = "#33A02C",
# "MR-Lasso" = "#FB9A99", "MR-ConMix" = "#E41A1C")
# wald_methods <- unique(mr_results$method[grepl("Wald Ratio", mr_results$method)])
# wald_colors <- if (length(wald_methods) > 0) setNames(hue_pal()(length(wald_methods)), wald_methods) else NULL
# color_list <- c(base_colors, wald_colors)
# color_list <- color_list[names(color_list) %in% unique(mr_results$method)]
#
# # Create forest plot
# x_min <- min(mr_results$estimate_lower, na.rm = TRUE)
# x_max <- max(mr_results$estimate_upper, na.rm = TRUE)
# x_label <- if (is_binary) "Causal Effect (Log Odds)" else "Causal Effect (Beta)"
# forest_plot <- ggplot(mr_results, aes(x = estimate, y = method, color = method)) +
# geom_errorbarh(aes(xmin = estimate_lower, xmax = estimate_upper), height = 0.2, na.rm = TRUE, linewidth = 0.8) +
# geom_point(size = 3) +
# geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
# labs(
# title = sprintf("Mendelian Randomization Estimates:\n %s on %s (r2 = %s, 500KB TSS)", seqid, outcome_name, clump_r2),
# x = x_label,
# y = ""
# ) +
# theme_minimal() +
# theme(
# plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
# axis.text.y = element_text(size = 12, face = "bold"),
# axis.text.x = element_text(size = 10),
# axis.title.x = element_text(size = 12),
# panel.grid.major = element_line(color = "grey90"),
# panel.grid.minor = element_blank(),
# legend.position = "none",
# panel.background = element_rect(fill = "white", color = NA),
# plot.background = element_rect(fill = "white", color = NA)
# ) +
# scale_color_manual(values = color_list) +
# xlim(x_min - 0.02 * (x_max - x_min), x_max + 0.02 * (x_max - x_min))
# Save outputs to Excel
wb <- createWorkbook()
title_style <- createStyle(fontSize = 14, fontColour = "black", textDecoration = "bold", halign = "center")
header_style <- createStyle(fontColour = "white", fgFill = "#C8A2C8", textDecoration = "bold", halign = "center", border = "TopBottomLeftRight")
add_formatted_sheet <- function(wb, sheet_name, data, title) {
if (is.null(data) || !is.data.frame(data) || nrow(data) == 0 || ncol(data) == 0) {
addWorksheet(wb, sheet_name)
writeData(wb, sheet_name, "No data available")
return()
}
addWorksheet(wb, sheet_name)
writeData(wb, sheet_name, title, startRow = 1, startCol = 1)
writeData(wb, sheet_name, data, startRow = 3, startCol = 1, headerStyle = header_style)
mergeCells(wb, sheet_name, cols = 1:ncol(data), rows = 1)
setColWidths(wb, sheet_name, cols = 1:ncol(data), widths = "auto")
addStyle(wb, sheet_name, title_style, rows = 1, cols = 1)
}
toc_data <- data.frame(
Sheet = c("IVW", "MR-Egger", "Weighted_Median", "MR-Lasso", "MR-ConMix",
"Wald_Ratios", "Heterogeneity_Test", "Pleiotropy_Test", "Leave_One_Out", "Clumped_Data"),
Title = c("Inverse Variance Weighted Results", "MR-Egger Results",
"Weighted Median Results", "MR-Lasso Results", "MR-ConMix Results",
"Wald Ratio Tests", "Heterogeneity Test Results",
"Pleiotropy Test Results", "Leave-One-Out Analysis", "Clumped SNPs Data")
)
addWorksheet(wb, "TOC")
writeData(wb, "TOC", "Table of Contents", startRow = 1, startCol = 1)
mergeCells(wb, "TOC", cols = 1:2, rows = 1)
addStyle(wb, "TOC", title_style, rows = 1, cols = 1)
writeData(wb, "TOC", toc_data, startRow = 3, startCol = 1, headerStyle = header_style)
setColWidths(wb, "TOC", cols = 1:2, widths = "auto")
add_formatted_sheet(wb, "IVW", ivw_result, "Inverse Variance Weighted Results")
add_formatted_sheet(wb, "MR-Egger", egger_result, "MR-Egger Results")
add_formatted_sheet(wb, "Weighted_Median", weighted_median_result, "Weighted Median Results")
if (!is.null(lasso_result)) {
lasso_df <- data.frame(
id.exposure = seqid,
id.outcome = outcome_name,
Exposure = lasso_result@Exposure,
Outcome = lasso_result@Outcome,
Estimate = lasso_result@Estimate,
StdError = lasso_result@StdError,
CILower = lasso_result@CILower,
CIUpper = lasso_result@CIUpper,
Pvalue = lasso_result@Pvalue,
SNPs = lasso_result@SNPs,
Valid = lasso_result@Valid,
ValidSNPs = paste(lasso_result@ValidSNPs, collapse = ", ")
)
add_formatted_sheet(wb, "MR-Lasso", lasso_df, "MR-Lasso Results")
}
if (!is.null(conmix_result)) {
conmix_df <- data.frame(
id.exposure = seqid,
id.outcome = outcome_name,
Exposure = conmix_result@Exposure,
Outcome = conmix_result@Outcome,
Estimate = conmix_result@Estimate,
Pvalue = conmix_result@Pvalue,
CILower = conmix_result@CILower,
CIUpper = conmix_result@CIUpper,
SNPs = conmix_result@SNPs
)
add_formatted_sheet(wb, "MR-ConMix", conmix_df, "MR-ConMix Results")
}
add_formatted_sheet(wb, "Wald_Ratios", wald_ratios, "Wald Ratio Tests")
add_formatted_sheet(wb, "Heterogeneity_Test", heterogeneity_result, "Heterogeneity Test Results")
add_formatted_sheet(wb, "Pleiotropy_Test", pleiotropy_result, "Pleiotropy Test Results")
add_formatted_sheet(wb, "Leave_One_Out", loo_result, "Leave-One-Out Analysis")
add_formatted_sheet(wb, "Clumped_Data", clumped_dat, "Clumped SNPs Data")
saveWorkbook(wb, excel_file, overwrite = FALSE)
cat(sprintf("Results saved for %s with %s (r2 = %s): %s\n", seqid, outcome_name, clump_r2, excel_file))
return(list(
ivw = ivw_result, egger = egger_result, weighted_median = weighted_median_result,
lasso = lasso_result, conmix = conmix_result, wald_ratios = wald_ratios,
heterogeneity = heterogeneity_result, pleiotropy = pleiotropy_result,
loo = loo_result, clumped = clumped_dat,
seqid = seqid, outcome = outcome_name, r2_value = clump_label
))
}
# Parallel setup
num_cores <- min(detectCores() - 1, 16)
registerDoParallel(cores = num_cores)
cat(sprintf("Using %d cores for parallel processing\n", num_cores))
# Process tasks in parallel (default list combine: one element per task, so
# each per-task result list stays intact; .combine = c would flatten them)
master_data <- foreach(i = 1:nrow(task_pairs), .packages = c("dplyr", "tidyr", "data.table", "TwoSampleMR", "MendelianRandomization", "ieugwasr", "openxlsx", "genetics.binaRies")) %dopar% {
harmonized_file <- task_pairs$harmonized_file[i]
clump_r2_index <- task_pairs$clump_r2_index[i]
clump_r2 <- clump_r2_values[clump_r2_index]
clump_label <- clump_labels[clump_r2_index]
# Extract seqid and outcome
file_name <- basename(harmonized_file)
pattern <- "^tss_harmonized_(SeqId_[0-9]+_[0-9]+)_(.*)\\.csv$"
seqid <- tryCatch(sub(pattern, "\\1", file_name), error = function(e) NULL)
outcome_name <- tryCatch(sub(pattern, "\\2", file_name), error = function(e) NULL)
if (is.null(seqid) || is.null(outcome_name)) {
cat(sprintf("Failed to parse %s; skipping\n", file_name))
return(NULL)
}
# Clean outcome name for binary detection
outcome_name_clean <- gsub("[^[:alnum:]]", "_", outcome_name)
outcome_name_clean <- gsub("_+", "_", outcome_name_clean)
outcome_name_clean <- gsub("^_|_$", "", outcome_name_clean)
is_binary <- outcome_name_clean %in% binary_outcomes
# Read harmonized data
harmonized_data <- tryCatch({
fread(harmonized_file, nThread = 1)
}, error = function(e) {
cat(sprintf("Failed to read %s: %s\n", file_name, e$message))
return(NULL)
})
if (is.null(harmonized_data)) return(NULL)
# Perform MR analysis
result <- perform_mr(harmonized_data, seqid, outcome_name, clump_r2, clump_label, is_binary)
# Return result
return(result)
}
# Clean up parallel cluster
stopImplicitCluster()
# Filter non-null results
master_data <- master_data[!sapply(master_data, is.null)]
cat(sprintf("Total valid MR results: %d\n", length(master_data)))
# Create master file
today <- Sys.Date()
master_file <- file.path(mr_results_dir, sprintf("Master_MR_Results_aric_500KB_tss_%s.xlsx", today))
wb <- createWorkbook()
title_style <- createStyle(fontSize = 14, fontColour = "black", textDecoration = "bold", halign = "center")
header_style <- createStyle(fontColour = "white", fgFill = "#C8A2C8", textDecoration = "bold", halign = "center", border = "TopBottomLeftRight")
tabs <- c("IVW", "MR-Egger", "Weighted_Median", "MR-Lasso", "MR-ConMix",
"Wald_Ratios", "Heterogeneity_Test", "Pleiotropy_Test", "Leave_One_Out", "Clumped_Data")
master_data_combined <- setNames(vector("list", length(tabs)), tabs)
for (result in master_data) {
seqid <- result$seqid
outcome <- result$outcome
r2_value <- result$r2_value
for (tab in tabs) {
# Map worksheet names to the keys used in each result list; tolower(tab)
# alone would miss names like "egger", "loo", and "clumped"
tab_keys <- c("IVW" = "ivw", "MR-Egger" = "egger", "Weighted_Median" = "weighted_median",
"MR-Lasso" = "lasso", "MR-ConMix" = "conmix", "Wald_Ratios" = "wald_ratios",
"Heterogeneity_Test" = "heterogeneity", "Pleiotropy_Test" = "pleiotropy",
"Leave_One_Out" = "loo", "Clumped_Data" = "clumped")
data <- result[[tab_keys[[tab]]]]
if (is.null(data)) next
if (is.data.frame(data) && nrow(data) == 0) next
if (tab %in% c("MR-Lasso", "MR-ConMix") && isS4(data)) {
data <- if (tab == "MR-Lasso") {
data.frame(
id.exposure = seqid,
id.outcome = outcome,
Exposure = data@Exposure,
Outcome = data@Outcome,
Estimate = data@Estimate,
StdError = data@StdError,
CILower = data@CILower,
CIUpper = data@CIUpper,
Pvalue = data@Pvalue,
SNPs = data@SNPs,
Valid = data@Valid,
ValidSNPs = paste(data@ValidSNPs, collapse = ", ")
)
} else {
data.frame(
id.exposure = seqid,
id.outcome = outcome,
Exposure = data@Exposure,
Outcome = data@Outcome,
Estimate = data@Estimate,
Pvalue = data@Pvalue,
CILower = data@CILower,
CIUpper = data@CIUpper,
SNPs = data@SNPs
)
}
} else {
# Ensure id.exposure and id.outcome are added if not already present
if (!"id.exposure" %in% colnames(data)) data$id.exposure <- seqid
if (!"id.outcome" %in% colnames(data)) data$id.outcome <- outcome
# Reorder columns to put id.exposure and id.outcome first
data <- data %>% dplyr::select(id.exposure, id.outcome, everything())
}
data$SeqID <- seqid
data$Outcome <- outcome
data$Clump_R2 <- r2_value
master_data_combined[[tab]] <- bind_rows(master_data_combined[[tab]], data)
}
}
add_formatted_sheet <- function(wb, sheet_name, data, title) {
if (is.null(data) || nrow(data) == 0) {
addWorksheet(wb, sheet_name)
writeData(wb, sheet_name, "No data available")
return()
}
addWorksheet(wb, sheet_name)
writeData(wb, sheet_name, title, startRow = 1, startCol = 1)
writeData(wb, sheet_name, data, startRow = 3, startCol = 1, headerStyle = header_style)
mergeCells(wb, sheet_name, cols = 1:ncol(data), rows = 1)
setColWidths(wb, sheet_name, cols = 1:ncol(data), widths = "auto")
addStyle(wb, sheet_name, title_style, rows = 1, cols = 1)
}
toc_data <- data.frame(
Sheet = tabs,
Title = c("Inverse Variance Weighted Results", "MR-Egger Results",
"Weighted Median Results", "MR-Lasso Results", "MR-ConMix Results",
"Wald Ratio Tests", "Heterogeneity Test Results",
"Pleiotropy Test Results", "Leave-One-Out Analysis", "Clumped SNPs Data")
)
addWorksheet(wb, "TOC")
writeData(wb, "TOC", "Table of Contents", startRow = 1, startCol = 1)
mergeCells(wb, "TOC", cols = 1:2, rows = 1)
addStyle(wb, "TOC", title_style, rows = 1, cols = 1)
writeData(wb, "TOC", toc_data, startRow = 3, startCol = 1, headerStyle = header_style)
setColWidths(wb, "TOC", cols = 1:2, widths = "auto")
for (tab in tabs) {
add_formatted_sheet(wb, tab, master_data_combined[[tab]], toc_data$Title[tabs == tab])
}
saveWorkbook(wb, master_file, overwrite = TRUE)
cat(sprintf("Master file saved to: %s\n", master_file))
# Clean up
sink()
cat(sprintf("MR analysis completed on %s\n", Sys.time()))
#!/usr/bin/env Rscript
# /Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/scripts
# Rscript 2025_05_18_chatgpt_parallel_merge_mr_results_aric_500kb.R
# Load necessary libraries
library(openxlsx)
library(dplyr)
library(data.table)
library(parallel)
library(foreach)
library(doParallel)
# Set directories
results_dir <- "/Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/final_mr_res_aric_500kb"
subdirs <- c("r2_001", "r2_01", "r2_1")
# Find all Excel files in subdirectories
result_files <- unlist(lapply(subdirs, function(d) list.files(
file.path(results_dir, d),
pattern = "MR_Results_.*\\.xlsx$",
full.names = TRUE,
recursive = TRUE
)))
cat("Found", length(result_files), "Excel files in the subdirectories.\n")
# Set up parallel processing
num_cores <- min(4, detectCores())
cl <- makeCluster(num_cores)
registerDoParallel(cl)
# Process files in batches
batch_size <- 5000
batches <- split(result_files, ceiling(seq_along(result_files) / batch_size))
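# split()/ceiling() chunks the file vector into batches of batch_size, e.g.
#   split(1:12, ceiling(seq_along(1:12) / 5))  # -> 1:5, 6:10, 11:12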
# Master data structure
master_data_combined <- list()
cat("Processing files in batches of", batch_size, "\n")
# Batch processing
for (batch_index in seq_along(batches)) {
batch <- batches[[batch_index]]
cat(sprintf("Processing batch %d of %d\n", batch_index, length(batches)))
batch_results <- foreach(file = batch, .packages = c("openxlsx", "data.table")) %dopar% {
tryCatch({
sheets <- getSheetNames(file)
data_list <- list()
for (sheet in sheets) {
data <- read.xlsx(file, sheet = sheet, startRow = 3)
if (!is.null(data) && nrow(data) > 0 && !any(grepl("No data available", data))) {
data$r2_clump <- basename(dirname(file))
data_list[[sheet]] <- data
}
}
data_list
}, error = function(e) {
cat(sprintf("Error reading file %s: %s\n", file, e$message))
NULL
})
}
# Combine results
for (result in batch_results) {
if (!is.null(result)) {
for (sheet in names(result)) {
if (is.null(master_data_combined[[sheet]])) {
master_data_combined[[sheet]] <- result[[sheet]]
} else {
master_data_combined[[sheet]] <- rbindlist(list(master_data_combined[[sheet]], result[[sheet]]), use.names = TRUE, fill = TRUE)
}
}
}
}
gc() # Clean up memory after each batch
}
stopCluster(cl)
# Save merged results
today <- format(Sys.Date(), "%Y-%m-%d")
output_file <- file.path(results_dir, paste0("chatgpt_merged_mr_results_", today, ".xlsx"))
wb <- createWorkbook()
for (sheet in names(master_data_combined)) {
addWorksheet(wb, sheet)
writeData(wb, sheet, master_data_combined[[sheet]])
}
saveWorkbook(wb, output_file, overwrite = TRUE)
cat("Merging complete. Master file saved to:", output_file, "\n")
#!/usr/bin/env Rscript
# Rscript annotate_mr_results_aric.R
suppressPackageStartupMessages({
library(openxlsx)
library(data.table)
library(dplyr)
})
# Paths
seqid_file <- "/Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/pqtls/seqid.txt"
input_file <- "/Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/final_mr_res_aric_500kb/chatgpt_merged_mr_results_2025-05-19.xlsx"
output_dir <- dirname(input_file)
today <- format(Sys.Date(), "%Y-%m-%d")
output_file <- file.path(output_dir, paste0("mr_aric_merged_annotated_", today, ".xlsx"))
# Read the seqid lookup (tab-delimited)
seqid <- fread(seqid_file, sep = "\t", header = TRUE, na.strings = c("", "NA"))
# Load sheet names from the merged workbook
sheets <- getSheetNames(input_file)
# Create a new workbook for output
wb_out <- createWorkbook()
for (sh in sheets) {
# Read the sheet
df <- read.xlsx(input_file, sheet = sh)
# If this is not the TOC sheet and id.exposure exists, annotate
if (toupper(sh) != "TOC" && "id.exposure" %in% names(df)) {
df <- df %>%
left_join(seqid, by = c("id.exposure" = "seqid_in_sample"))
}
# Add sheet and write data
addWorksheet(wb_out, sh)
writeData(wb_out, sh, df)
}
# Save the annotated workbook
saveWorkbook(wb_out, output_file, overwrite = TRUE)
cat("Annotated workbook written to:\n", output_file, "\n")
#!/usr/bin/env Rscript
# cd /Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/scripts
# Rscript filter_mr_results_for_soma.R
# Filters the annotated MR results to only those UniProt IDs present in soma.csv
suppressPackageStartupMessages({
library(openxlsx)
library(data.table)
})
# === Paths ===
base_dir <- "/Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/final_mr_res_aric_500kb"
input_file <- file.path(base_dir, "mr_aric_merged_annotated_2025-05-21.xlsx")
soma_file <- "/Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/data/soma.csv"
today <- format(Sys.Date(), "%Y-%m-%d")
output_file <- file.path(base_dir, paste0("aric_mr_results_zsuzsu_list_", today, ".xlsx"))
# === Read the list of UniProt IDs ===
soma_dt <- fread(soma_file, header = TRUE)
uniprot_list <- unique(soma_dt$UniProt)
# === Load sheet names ===
sheets <- getSheetNames(input_file)
# === Prepare output workbook ===
wb_out <- createWorkbook()
# === Process each sheet ===
for (sh in sheets) {
df <- read.xlsx(input_file, sheet = sh)
dt <- as.data.table(df)
if (toupper(sh) != "TOC" && "uniprot_id" %in% names(dt)) {
# keep only rows whose uniprot_id is in the soma list
dt <- dt[uniprot_id %in% uniprot_list]
}
# add filtered (or unfiltered TOC) sheet to new workbook
addWorksheet(wb_out, sh)
writeData(wb_out, sh, dt)
}
# === Save the filtered workbook ===
saveWorkbook(wb_out, output_file, overwrite = TRUE)
cat("Filtered results saved to:\n", output_file, "\n")
Same as above…
I had previously obtained the pQTL files in fall 2024 (see https://rpubs.com/YodaMendel/1243451 for an example of how to programmatically retrieve UKB-PPP files from Synapse). For obtaining the files and adding rsids, refer to:
- /Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/ukbpp_chemokines_40.Rmd
- https://rpubs.com/YodaMendel/1268255
- https://rpubs.com/YodaMendel/1243451
I moved the files to the external drive and processed them; see:
- /Volumes/LaCie/tahir/mr_ukbppp_all_sbp/scripts/mr_all_ukbppp_sbp.Rmd
- /Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/scripts/mr_ukbppp_aric_cardiometabolic.Rmd
The eight outcome GWAS (ieu-b-38, ieu-b-40, GCST90271557, ukb-b-19393, ieu-a-302, the two T2D BMI-adjusted/unadjusted files, and ieu-a-7) were downloaded and formatted for this analysis exactly as described in the section above, using the same scripts.
Rscript mr_ukbppp_all_olink_cardiometabolic_harmonization.R
# Clear workspace
rm(list = ls())
# Load libraries
library(TwoSampleMR)
library(dplyr)
library(data.table)
library(foreach)
library(doParallel)
library(RhpcBLASctl)
# Define %||% operator for fallback
`%||%` <- function(x, y) if (length(x) > 0) x else y
# Raise the memory ceiling (128 GB). Note: R_MAX_VSIZE is read when R starts,
# so to take effect reliably it should be set in ~/.Renviron before launching.
Sys.setenv("R_MAX_VSIZE" = "128Gb")
cat("Max vector size set to:", Sys.getenv("R_MAX_VSIZE"), "\n")
# Optimize BLAS for single-threaded performance
blas_set_num_threads(1)
# Set directories and files
base_dir <- "/Volumes/LaCie/tahir/mr_ukbppp_olink_cardiometabolic"
pqtl_dir <- file.path("/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/fixed")
outcome_dir <- "/Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/data/formatted"
results_dir <- file.path(base_dir, "harmonization")
annotation_file <- file.path("/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/mart_clean_37_alias.txt")
# Create results directory
if (!dir.exists(results_dir)) {
dir.create(results_dir, recursive = TRUE)
}
# Verify directories and annotation file
if (!dir.exists(pqtl_dir)) {
stop(sprintf("pQTL directory %s does not exist.", pqtl_dir))
}
if (!dir.exists(outcome_dir)) {
stop(sprintf("Outcome directory %s does not exist.", outcome_dir))
}
if (!file.exists(annotation_file)) {
stop(sprintf("Annotation file %s does not exist.", annotation_file))
}
# Load annotation file
gene_data <- fread(annotation_file, data.table = FALSE)
cat(sprintf("Loaded annotation file with %d genes\n", nrow(gene_data)))
# List pQTL and outcome files
pqtl_files <- list.files(pqtl_dir, pattern = "^[A-Z0-9]+_[A-Z0-9]+_OID[0-9]+_v1_.*\\.gz$", full.names = TRUE)
outcome_files <- list.files(outcome_dir, pattern = "_formatted.*\\.tsv$|_European.*\\.tsv$", full.names = TRUE)
cat("Total pQTL files to process:", length(pqtl_files), "\n")
cat("Total outcome files to process:", length(outcome_files), "\n")
# Create all pQTL-outcome pairs for parallel processing
task_pairs <- expand.grid(pqtl_file = pqtl_files, outcome_file = outcome_files, stringsAsFactors = FALSE)
cat("Total tasks (pQTL-outcome pairs):", nrow(task_pairs), "\n")
# Function to harmonize pQTL with an outcome
harmonize_pqtl <- function(pqtl_file, outcome_file, outcome_data, results_dir, gene_data) {
# Extract olink_id from pQTL file name
olink_id <- sub("_v1_.*$", "", basename(pqtl_file))
cat(sprintf("Processing %s...\n", olink_id))
# Get TSS and chromosome
parts <- strsplit(olink_id, "_")[[1]]
protein <- parts[1]
protein_data <- gene_data %>%
filter(tolower(`Gene name`) == tolower(protein) | tolower(`Gene Synonym`) == tolower(protein))
if (nrow(protein_data) == 0) {
cat(sprintf("No TSS data for %s in annotation file; skipping\n", olink_id))
return(NULL)
}
protein_chrom <- as.character(protein_data$`Chromosome/scaffold name`[1])
protein_tss <- as.numeric(protein_data$`Transcription start site (TSS)`[1])
window_size <- 500000
tss_start <- protein_tss - window_size
tss_end <- protein_tss + window_size
# Format exposure data
exp_dat <- fread(pqtl_file) %>%
filter(
CHROM == protein_chrom,
POS19 >= tss_start,
POS19 <= tss_end,
!is.na(rsid),
grepl("^rs", rsid)
) %>%
dplyr::select(
SNP = rsid,
chr.exposure = CHROM,
pos.exposure = POS19,
effect_allele.exposure = ALLELE1,
other_allele.exposure = ALLELE0,
eaf.exposure = A1FREQ,
beta.exposure = BETA,
se.exposure = SE,
pval.exposure = P,
samplesize.exposure = N
) %>%
mutate(
exposure = olink_id,
id.exposure = olink_id,
chr.exposure = as.character(chr.exposure)
)
# Extract outcome name from pre-loaded outcome data
outcome_name <- unique(outcome_data$OUTCOME)
if (length(outcome_name) != 1) {
cat(sprintf("Multiple or no OUTCOME values in %s; skipping\n", basename(outcome_file)))
return(NULL)
}
# Format outcome data
out_dat <- outcome_data %>%
filter(
CHR == protein_chrom,
BP >= tss_start,
BP <= tss_end,
!is.na(SNP),
grepl("^rs", SNP)
) %>%
dplyr::select(
SNP = SNP,
chr.outcome = CHR,
pos.outcome = BP,
effect_allele.outcome = ALT,
other_allele.outcome = REF,
eaf.outcome = EAF,
beta.outcome = BETA,
se.outcome = SE,
pval.outcome = PVAL,
samplesize.outcome = N
) %>%
mutate(
outcome = outcome_name,
id.outcome = outcome_name,
chr.outcome = as.character(chr.outcome)
)
# Harmonize data
dat <- tryCatch({
harmonise_data(exposure_dat = exp_dat, outcome_dat = out_dat, action = 1)
}, error = function(e) {
cat(sprintf("Harmonization failed for %s with %s: %s\n", olink_id, outcome_name, e$message))
return(NULL)
})
if (is.null(dat) || nrow(dat) == 0) {
cat(sprintf("No harmonized data for %s with %s; skipping\n", olink_id, outcome_name))
return(NULL)
}
# Clean outcome_name for file naming
outcome_name_file <- gsub("[^[:alnum:]]", "_", outcome_name)
outcome_name_file <- gsub("_+", "_", outcome_name_file)
outcome_name_file <- gsub("^_|_$", "", outcome_name_file)
# Save harmonized data
harmonized_file <- file.path(results_dir, sprintf("tss_harmonized_%s_%s.csv", olink_id, outcome_name_file))
fwrite(dat, harmonized_file, row.names = FALSE)
cat(sprintf("Harmonized data saved for %s with %s: %s\n", olink_id, outcome_name, harmonized_file))
return(NULL)
}
# Parallel setup
num_cores <- min(detectCores() - 1, 16)
registerDoParallel(cores = num_cores)
cat(sprintf("Using %d cores for parallel processing\n", num_cores))
# Pre-load outcome data to avoid repeated reads
outcome_data_list <- lapply(outcome_files, function(file) {
cat(sprintf("Pre-loading outcome file: %s\n", basename(file)))
fread(file, nThread = 1)
})
names(outcome_data_list) <- outcome_files
# Process all pQTL-outcome pairs in parallel
foreach(i = 1:nrow(task_pairs), .packages = c("dplyr", "data.table", "TwoSampleMR")) %dopar% {
pqtl_file <- task_pairs$pqtl_file[i]
outcome_file <- task_pairs$outcome_file[i]
outcome_data <- outcome_data_list[[outcome_file]]
harmonize_pqtl(pqtl_file, outcome_file, outcome_data, results_dir, gene_data)
}
# Clean up
stopImplicitCluster()
cat("Harmonization process completed for all pQTLs and outcomes.\n")
Rscript 2025_05_20_parallel_clump_mr_olink_500kb_8_outcomes.R
NB: If this analysis needs to be re-run, recopy the code below rather than simply invoking the Rscript: the original run timed out, and I modified the script to pick up where it left off…
#!/usr/bin/env Rscript
# /Volumes/LaCie/tahir/mr_ukbppp_olink_cardiometabolic/scripts
# Rscript 2025_05_20_parallel_clump_mr_olink_500kb_8_outcomes.R
# Clear workspace
rm(list = ls())
# Load libraries
library(TwoSampleMR)
library(MendelianRandomization)
library(ieugwasr)
library(MRInstruments)
library(dplyr)
library(tidyr)
library(data.table)
library(openxlsx)
library(ggplot2)
library(scales)
library(RhpcBLASctl)
library(patchwork)
library(genetics.binaRies)
library(foreach)
library(doParallel)
# Define %||% operator
`%||%` <- function(x, y) if (length(x) > 0) x else y
# Raise the memory ceiling (64 GB). Note: R_MAX_VSIZE is read when R starts,
# so to take effect reliably it should be set in ~/.Renviron before launching.
Sys.setenv("R_MAX_VSIZE" = "64Gb")
cat("Max vector size set to:", Sys.getenv("R_MAX_VSIZE"), "\n")
# Optimize BLAS
blas_set_num_threads(1)
# Set directories
base_dir <- "/Volumes/LaCie/tahir/mr_ukbppp_olink_cardiometabolic"
harmonized_dir <- file.path(base_dir, "harmonization")
mr_results_dir <- file.path(base_dir, "mr_results_olink_500kb")
if (!dir.exists(harmonized_dir)) stop("Harmonized directory does not exist:", harmonized_dir)
if (!dir.exists(mr_results_dir)) dir.create(mr_results_dir, recursive = TRUE)
# LD clumping thresholds
clump_r2_values <- c(0.001, 0.01, 0.1)
clump_labels <- c("r2_001", "r2_01", "r2_1")
# Define binary outcomes
binary_outcomes <- c("Coronary_Heart_Disease_CARDIoGRAMplusC4D", "T2D_BMIadjusted_ExTexT2D", "T2D_BMIunadjusted_ExTexT2D")
# Set up logging
log_file <- file.path(base_dir, "mr_analysis_olink_500KB_tss.log")
sink(log_file, append = TRUE, split = TRUE)
cat(sprintf("Starting MR analysis on %s\n", Sys.time()))
# List harmonized files
harmonized_files <- list.files(harmonized_dir, pattern = "^tss_harmonized_.*\\.csv$", full.names = TRUE)
cat(sprintf("Total harmonized files found: %d\n", length(harmonized_files)))
if (length(harmonized_files) == 0) {
cat("No harmonized files found in", harmonized_dir, "\n")
sink()
stop("No harmonized files to process")
}
cat("Harmonized file examples:", paste(basename(head(harmonized_files, 5)), collapse = ", "), "\n")
# Create task list
task_pairs <- expand.grid(
harmonized_file = harmonized_files,
clump_r2_index = seq_along(clump_r2_values),
stringsAsFactors = FALSE
)
cat("Total tasks (pQTL-outcome-r2 pairs):", nrow(task_pairs), "\n")
# Function to perform MR analysis
perform_mr <- function(harmonized_data, protein_id, outcome_name, clump_r2, clump_label, is_binary) {
today <- Sys.Date()
output_dir <- file.path(mr_results_dir, clump_label)
if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)
excel_file <- file.path(output_dir, sprintf("MR_Results_%s_%s_%s_500KB_tss_%s.xlsx", protein_id, outcome_name, clump_label, today))
# Skip if Excel file exists
if (file.exists(excel_file)) {
cat(sprintf("Skipping %s with %s (r2 = %s): Existing Excel file found\n", protein_id, outcome_name, clump_r2))
return(NULL)
}
# Use pre-loaded harmonized data
dat <- harmonized_data
cat(sprintf("Processing %s with %s (r2 = %s): %d rows\n", protein_id, outcome_name, clump_r2, nrow(dat)))
if (nrow(dat) == 0) {
cat(sprintf("No data for %s with %s; skipping\n", protein_id, outcome_name))
return(NULL)
}
# Filter instruments
instruments <- dat %>%
filter(mr_keep == TRUE, pval.exposure < 5e-6) %>%
mutate(
rsid = SNP,
pval = pval.exposure,
id = sprintf("%s exposure", protein_id),
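# single-SNP instrument-strength approximation: F = (beta/se)^2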
F_stat = (beta.exposure / se.exposure)^2
) %>%
filter(F_stat > 10)
cat(sprintf("Instruments for %s with %s: %d\n", protein_id, outcome_name, nrow(instruments)))
if (nrow(instruments) == 0) {
cat(sprintf("No valid instruments for %s with %s; skipping\n", protein_id, outcome_name))
return(NULL)
}
# Steiger filtering
instruments <- tryCatch({
steiger_filtering(instruments)
}, error = function(e) {
cat(sprintf("Steiger filtering failed for %s with %s: %s\n", protein_id, outcome_name, e$message))
return(NULL)
})
if (is.null(instruments) || !"steiger_dir" %in% colnames(instruments) || nrow(instruments) == 0) {
cat(sprintf("Steiger filtering failed or no instruments for %s with %s; skipping\n", protein_id, outcome_name))
return(NULL)
}
instruments <- instruments %>% filter(steiger_dir == TRUE)
cat(sprintf("Instruments after Steiger filtering: %d\n", nrow(instruments)))
if (nrow(instruments) == 0) return(NULL)
# LD clumping
clumped_dat <- NULL
tryCatch({
clumped <- ld_clump(
dplyr::tibble(rsid = instruments$rsid, pval = instruments$pval, id = instruments$id),
plink_bin = genetics.binaRies::get_plink_binary(),
bfile = "/Users/charleenadams/1000G_bfiles/EUR/EUR", # REPLACE WITH YOUR CORRECT bfile PATH
clump_r2 = clump_r2,
clump_kb = 500
)
clumped_dat <- instruments %>% filter(SNP %in% clumped$rsid)
cat(sprintf("SNPs after clumping (r2 = %s): %d\n", clump_r2, nrow(clumped_dat)))
}, error = function(e) {
cat(sprintf("LD clumping failed for %s with %s (r2 = %s): %s\n", protein_id, outcome_name, clump_r2, e$message))
return(NULL)
})
if (is.null(clumped_dat) || nrow(clumped_dat) == 0) {
cat(sprintf("No SNPs after clumping for %s with %s (r2 = %s); skipping\n", protein_id, outcome_name, clump_r2))
return(NULL)
}
# Perform MR analyses
ivw_result <- tryCatch(mr(clumped_dat, method_list = "mr_ivw"), error = function(e) {
cat(sprintf("IVW failed for %s with %s: %s\n", protein_id, outcome_name, e$message))
NULL
})
egger_result <- if (nrow(clumped_dat) >= 3) {
tryCatch(mr(clumped_dat, method_list = "mr_egger_regression"), error = function(e) {
cat(sprintf("MR-Egger failed for %s with %s: %s\n", protein_id, outcome_name, e$message))
NULL
})
} else NULL
weighted_median_result <- tryCatch(mr(clumped_dat, method_list = "mr_weighted_median"), error = function(e) {
cat(sprintf("Weighted Median failed for %s with %s: %s\n", protein_id, outcome_name, e$message))
NULL
})
mr_input_obj <- tryCatch({ # renamed to avoid shadowing MendelianRandomization::mr_input
mr_input(
bx = clumped_dat$beta.exposure,
bxse = clumped_dat$se.exposure,
by = clumped_dat$beta.outcome,
byse = clumped_dat$se.outcome,
exposure = protein_id,
outcome = outcome_name,
snps = clumped_dat$SNP
)
}, error = function(e) {
cat(sprintf("MR input failed for %s with %s: %s\n", protein_id, outcome_name, e$message))
return(NULL)
})
if (is.null(mr_input_obj)) return(NULL)
lasso_result <- tryCatch(mr_lasso(mr_input_obj), error = function(e) {
cat(sprintf("MR-Lasso failed for %s with %s: %s\n", protein_id, outcome_name, e$message))
NULL
})
conmix_result <- tryCatch(mr_conmix(mr_input_obj), error = function(e) {
cat(sprintf("MR-ConMix failed for %s with %s: %s\n", protein_id, outcome_name, e$message))
NULL
})
heterogeneity_result <- tryCatch(mr_heterogeneity(clumped_dat), error = function(e) {
cat(sprintf("Heterogeneity test failed for %s with %s: %s\n", protein_id, outcome_name, e$message))
NULL
})
pleiotropy_result <- if (nrow(clumped_dat) >= 3) tryCatch(mr_pleiotropy_test(clumped_dat), error = function(e) {
cat(sprintf("Pleiotropy test failed for %s with %s: %s\n", protein_id, outcome_name, e$message))
NULL
}) else NULL
loo_result <- tryCatch(mr_leaveoneout(clumped_dat), error = function(e) {
cat(sprintf("Leave-one-out failed for %s with %s: %s\n", protein_id, outcome_name, e$message))
NULL
})
# Calculate Wald ratios with id.exposure and id.outcome
wald_ratios <- clumped_dat %>%
mutate(
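# per-SNP Wald ratio (beta.outcome / beta.exposure) with delta-method standard error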
wald_beta = beta.outcome / beta.exposure,
wald_se = sqrt((se.outcome^2 / beta.exposure^2) + ((beta.outcome^2 * se.exposure^2) / (beta.exposure^4))),
pval = 2 * pnorm(abs(wald_beta / wald_se), lower.tail = FALSE),
method = paste("Wald Ratio:", SNP),
id.exposure = protein_id,
id.outcome = outcome_name
) %>%
dplyr::select(id.exposure, id.outcome, SNP, wald_beta, wald_se, pval, method)
# Save outputs to Excel
wb <- createWorkbook()
title_style <- createStyle(fontSize = 14, fontColour = "black", textDecoration = "bold", halign = "center")
header_style <- createStyle(fontColour = "white", fgFill = "#C8A2C8", textDecoration = "bold", halign = "center", border = "TopBottomLeftRight")
add_formatted_sheet <- function(wb, sheet_name, data, title) {
if (is.null(data) || !is.data.frame(data) || nrow(data) == 0 || ncol(data) == 0) {
addWorksheet(wb, sheet_name)
writeData(wb, sheet_name, "No data available")
return()
}
addWorksheet(wb, sheet_name)
writeData(wb, sheet_name, title, startRow = 1, startCol = 1)
writeData(wb, sheet_name, data, startRow = 3, startCol = 1, headerStyle = header_style)
mergeCells(wb, sheet_name, cols = 1:ncol(data), rows = 1)
setColWidths(wb, sheet_name, cols = 1:ncol(data), widths = "auto")
addStyle(wb, sheet_name, title_style, rows = 1, cols = 1)
}
toc_data <- data.frame(
Sheet = c("IVW", "MR-Egger", "Weighted_Median", "MR-Lasso", "MR-ConMix",
"Wald_Ratios", "Heterogeneity_Test", "Pleiotropy_Test", "Leave_One_Out", "Clumped_Data"),
Title = c("Inverse Variance Weighted Results", "MR-Egger Results",
"Weighted Median Results", "MR-Lasso Results", "MR-ConMix Results",
"Wald Ratio Tests", "Heterogeneity Test Results",
"Pleiotropy Test Results", "Leave-One-Out Analysis", "Clumped SNPs Data")
)
addWorksheet(wb, "TOC")
writeData(wb, "TOC", "Table of Contents", startRow = 1, startCol = 1)
mergeCells(wb, "TOC", cols = 1:2, rows = 1)
addStyle(wb, "TOC", title_style, rows = 1, cols = 1)
writeData(wb, "TOC", toc_data, startRow = 3, startCol = 1, headerStyle = header_style)
setColWidths(wb, "TOC", cols = 1:2, widths = "auto")
add_formatted_sheet(wb, "IVW", ivw_result, "Inverse Variance Weighted Results")
add_formatted_sheet(wb, "MR-Egger", egger_result, "MR-Egger Results")
add_formatted_sheet(wb, "Weighted_Median", weighted_median_result, "Weighted Median Results")
if (!is.null(lasso_result)) {
lasso_df <- data.frame(
id.exposure = protein_id,
id.outcome = outcome_name,
Exposure = lasso_result@Exposure,
Outcome = lasso_result@Outcome,
Estimate = lasso_result@Estimate,
StdError = lasso_result@StdError,
CILower = lasso_result@CILower,
CIUpper = lasso_result@CIUpper,
Pvalue = lasso_result@Pvalue,
SNPs = lasso_result@SNPs,
Valid = lasso_result@Valid,
ValidSNPs = paste(lasso_result@ValidSNPs, collapse = ", ")
)
add_formatted_sheet(wb, "MR-Lasso", lasso_df, "MR-Lasso Results")
}
if (!is.null(conmix_result)) {
conmix_df <- data.frame(
id.exposure = protein_id,
id.outcome = outcome_name,
Exposure = conmix_result@Exposure,
Outcome = conmix_result@Outcome,
Estimate = conmix_result@Estimate,
Pvalue = conmix_result@Pvalue,
CILower = conmix_result@CILower,
CIUpper = conmix_result@CIUpper,
SNPs = conmix_result@SNPs
)
add_formatted_sheet(wb, "MR-ConMix", conmix_df, "MR-ConMix Results")
}
add_formatted_sheet(wb, "Wald_Ratios", wald_ratios, "Wald Ratio Tests")
add_formatted_sheet(wb, "Heterogeneity_Test", heterogeneity_result, "Heterogeneity Test Results")
add_formatted_sheet(wb, "Pleiotropy_Test", pleiotropy_result, "Pleiotropy Test Results")
add_formatted_sheet(wb, "Leave_One_Out", loo_result, "Leave-One-Out Analysis")
add_formatted_sheet(wb, "Clumped_Data", clumped_dat, "Clumped SNPs Data")
saveWorkbook(wb, excel_file, overwrite = FALSE)
cat(sprintf("Results saved for %s with %s (r2 = %s): %s\n", protein_id, outcome_name, clump_r2, excel_file))
return(list(
ivw = ivw_result, egger = egger_result, weighted_median = weighted_median_result,
lasso = lasso_result, conmix = conmix_result, wald_ratios = wald_ratios,
heterogeneity = heterogeneity_result, pleiotropy = pleiotropy_result,
loo = loo_result, clumped = clumped_dat,
protein_id = protein_id, outcome = outcome_name, r2_value = clump_label
))
}
# Parallel setup
num_cores <- min(detectCores() - 1, 16)
registerDoParallel(cores = num_cores)
cat(sprintf("Using %d cores for parallel processing\n", num_cores))
# Process tasks in parallel; use foreach's default list output so each task's
# result list stays intact (.combine = c would flatten the per-task lists)
master_data <- foreach(i = 1:nrow(task_pairs), .packages = c("dplyr", "tidyr", "data.table", "TwoSampleMR", "MendelianRandomization", "ieugwasr", "openxlsx", "genetics.binaRies")) %dopar% {
harmonized_file <- task_pairs$harmonized_file[i]
clump_r2_index <- task_pairs$clump_r2_index[i]
clump_r2 <- clump_r2_values[clump_r2_index]
clump_label <- clump_labels[clump_r2_index]
# Extract protein_id and outcome
file_name <- basename(harmonized_file)
pattern <- "^tss_harmonized_([^_]+_[^_]+_[^_]+)_(.+)\\.csv$"
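# assumes file names encode a three-part (underscore-delimited) protein id followed by the outcome name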
protein_id <- tryCatch(sub(pattern, "\\1", file_name), error = function(e) NULL)
outcome_name <- tryCatch(sub(pattern, "\\2", file_name), error = function(e) NULL)
if (is.null(protein_id) || is.null(outcome_name)) {
cat(sprintf("Failed to parse %s; skipping\n", file_name))
return(NULL)
}
# Clean outcome name for binary detection
outcome_name_clean <- outcome_name
is_binary <- outcome_name_clean %in% binary_outcomes
# Read harmonized data
harmonized_data <- tryCatch({
fread(harmonized_file, nThread = 1)
}, error = function(e) {
cat(sprintf("Failed to read %s: %s\n", file_name, e$message))
return(NULL)
})
if (is.null(harmonized_data)) return(NULL)
# Perform MR analysis
result <- perform_mr(harmonized_data, protein_id, outcome_name, clump_r2, clump_label, is_binary)
# Return result
return(result)
}
# Clean up parallel cluster
stopImplicitCluster()
# Filter non-null results
master_data <- master_data[!sapply(master_data, is.null)]
cat(sprintf("Total valid MR results: %d\n", length(master_data)))
# Create master file
today <- Sys.Date()
master_file <- file.path(mr_results_dir, sprintf("Master_MR_Results_olink_500KB_tss_%s.xlsx", today))
wb <- createWorkbook()
title_style <- createStyle(fontSize = 14, fontColour = "black", textDecoration = "bold", halign = "center")
header_style <- createStyle(fontColour = "white", fgFill = "#C8A2C8", textDecoration = "bold", halign = "center", border = "TopBottomLeftRight")
tabs <- c("IVW", "MR-Egger", "Weighted_Median", "MR-Lasso", "MR-ConMix",
"Wald_Ratios", "Heterogeneity_Test", "Pleiotropy_Test", "Leave_One_Out", "Clumped_Data")
master_data_combined <- setNames(vector("list", length(tabs)), tabs)
# Map tab names to result-list elements (tolower() alone would not match e.g. "MR-Egger" -> "egger")
tab_to_slot <- c("IVW" = "ivw", "MR-Egger" = "egger", "Weighted_Median" = "weighted_median",
"MR-Lasso" = "lasso", "MR-ConMix" = "conmix", "Wald_Ratios" = "wald_ratios",
"Heterogeneity_Test" = "heterogeneity", "Pleiotropy_Test" = "pleiotropy", "Leave_One_Out" = "loo", "Clumped_Data" = "clumped")
for (result in master_data) {
protein_id <- result$protein_id
outcome <- result$outcome
r2_value <- result$r2_value
for (tab in tabs) {
data <- result[[tab_to_slot[[tab]]]]
# MR-Lasso/MR-ConMix results are S4 objects: test with isS4() rather than slot access on a data.frame
if (is.null(data)) next
if (!isS4(data) && (!is.data.frame(data) || nrow(data) == 0)) next
if (tab %in% c("MR-Lasso", "MR-ConMix") && isS4(data)) {
data <- if (tab == "MR-Lasso") {
data.frame(
id.exposure = protein_id,
id.outcome = outcome,
Exposure = data@Exposure,
Outcome = data@Outcome,
Estimate = data@Estimate,
StdError = data@StdError,
CILower = data@CILower,
CIUpper = data@CIUpper,
Pvalue = data@Pvalue,
SNPs = data@SNPs,
Valid = data@Valid,
ValidSNPs = paste(data@ValidSNPs, collapse = ", ")
)
} else {
data.frame(
id.exposure = protein_id,
id.outcome = outcome,
Exposure = data@Exposure,
Outcome = data@Outcome,
Estimate = data@Estimate,
Pvalue = data@Pvalue,
CILower = data@CILower,
CIUpper = data@CIUpper,
SNPs = data@SNPs
)
}
} else {
# Ensure id.exposure and id.outcome are added if not already present
if (!"id.exposure" %in% colnames(data)) data$id.exposure <- protein_id
if (!"id.outcome" %in% colnames(data)) data$id.outcome <- outcome
# Reorder columns to put id.exposure and id.outcome first
data <- data %>% dplyr::select(id.exposure, id.outcome, everything())
}
data$ProteinID <- protein_id
data$Outcome <- outcome
data$Clump_R2 <- r2_value
master_data_combined[[tab]] <- bind_rows(master_data_combined[[tab]], data)
}
}
add_formatted_sheet <- function(wb, sheet_name, data, title) {
if (is.null(data) || nrow(data) == 0) {
addWorksheet(wb, sheet_name)
writeData(wb, sheet_name, "No data available")
return()
}
addWorksheet(wb, sheet_name)
writeData(wb, sheet_name, title, startRow = 1, startCol = 1)
writeData(wb, sheet_name, data, startRow = 3, startCol = 1, headerStyle = header_style)
mergeCells(wb, sheet_name, cols = 1:ncol(data), rows = 1)
setColWidths(wb, sheet_name, cols = 1:ncol(data), widths = "auto")
addStyle(wb, sheet_name, title_style, rows = 1, cols = 1)
}
toc_data <- data.frame(
Sheet = tabs,
Title = c("Inverse Variance Weighted Results", "MR-Egger Results",
"Weighted Median Results", "MR-Lasso Results", "MR-ConMix Results",
"Wald Ratio Tests", "Heterogeneity Test Results",
"Pleiotropy Test Results", "Leave-One-Out Analysis", "Clumped SNPs Data")
)
addWorksheet(wb, "TOC")
writeData(wb, "TOC", "Table of Contents", startRow = 1, startCol = 1)
mergeCells(wb, "TOC", cols = 1:2, rows = 1)
addStyle(wb, "TOC", title_style, rows = 1, cols = 1)
writeData(wb, "TOC", toc_data, startRow = 3, startCol = 1, headerStyle = header_style)
setColWidths(wb, "TOC", cols = 1:2, widths = "auto")
for (tab in tabs) {
add_formatted_sheet(wb, tab, master_data_combined[[tab]], toc_data$Title[tabs == tab])
}
saveWorkbook(wb, master_file, overwrite = TRUE)
cat(sprintf("Master file saved to: %s\n", master_file))
# Clean up
sink()
cat(sprintf("MR analysis completed on %s\n", Sys.time()))
#!/usr/bin/env Rscript
# /Volumes/LaCie/tahir/mr_ukbppp_olink_cardiometabolic/scripts
# Rscript olink_merge_try4.R
# Load necessary libraries
library(openxlsx)
library(dplyr)
library(data.table)
library(parallel)
library(foreach)
library(doParallel)
# Set directories
results_dir <- "/Volumes/LaCie/tahir/mr_ukbppp_olink_cardiometabolic/mr_results_olink_500kb"
subdirs <- c("r2_001", "r2_01", "r2_1")
# Find all Excel files in subdirectories
result_files <- unlist(lapply(subdirs, function(d) list.files(
file.path(results_dir, d),
pattern = "MR_Results_.*\\.xlsx$",
full.names = TRUE,
recursive = TRUE
)))
cat("Found", length(result_files), "Excel files in the subdirectories.\n")
# Set up parallel processing
num_cores <- min(4, detectCores())
cl <- makeCluster(num_cores)
registerDoParallel(cl)
# Process files in batches
batch_size <- 5000
batches <- split(result_files, ceiling(seq_along(result_files) / batch_size))
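# e.g. with 12 files and batch_size = 5, ceiling(seq_along(files) / 5) is 1 1 1 1 1 2 2 2 2 2 3 3,
# so split() yields batches of at most batch_size files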
# Master data structure
master_data_combined <- list()
cat("Processing files in batches of", batch_size, "\n")
# Batch processing
for (batch_index in seq_along(batches)) {
batch <- batches[[batch_index]]
cat(sprintf("Processing batch %d of %d\n", batch_index, length(batches)))
batch_results <- foreach(file = batch, .packages = c("openxlsx", "data.table")) %dopar% {
tryCatch({
sheets <- getSheetNames(file)
data_list <- list()
for (sheet in sheets) {
data <- read.xlsx(file, sheet = sheet, startRow = 3)
if (!is.null(data) && nrow(data) > 0 && !any(grepl("No data available", data))) {
data$r2_clump <- basename(dirname(file))
data_list[[sheet]] <- data
}
}
data_list
}, error = function(e) {
cat(sprintf("Error reading file %s: %s\n", file, e$message))
NULL
})
}
# Combine results
for (result in batch_results) {
if (!is.null(result)) {
for (sheet in names(result)) {
if (is.null(master_data_combined[[sheet]])) {
master_data_combined[[sheet]] <- result[[sheet]]
} else {
master_data_combined[[sheet]] <- rbindlist(list(master_data_combined[[sheet]], result[[sheet]]), use.names = TRUE, fill = TRUE)
}
}
}
}
gc() # Clean up memory after each batch
}
stopCluster(cl)
# Save merged results
today <- format(Sys.Date(), "%Y-%m-%d")
output_file <- file.path(results_dir, paste0("OLINK_chatgpt_merged_mr_results_", today, ".xlsx"))
wb <- createWorkbook()
for (sheet in names(master_data_combined)) {
addWorksheet(wb, sheet)
writeData(wb, sheet, master_data_combined[[sheet]])
}
saveWorkbook(wb, output_file, overwrite = TRUE)
cat("Merging complete. Master file saved to:", output_file, "\n")
#!/usr/bin/env Rscript
# Rscript filter_olink_mr_by_uniprot.R
# Filters the merged OLINK MR results to only those UniProt IDs in olink.csv
suppressPackageStartupMessages({
library(openxlsx)
library(data.table)
})
# === Paths ===
base_dir <- "/Volumes/LaCie/tahir/mr_ukbppp_olink_cardiometabolic/mr_results_olink_500kb"
input_xlsx <- file.path(base_dir, "OLINK_chatgpt_merged_mr_results_2025-05-21.xlsx")
olink_csv <- "/Volumes/LaCie/tahir/mr_ukbppp_aric_cardiometabolic/data/olink.csv"
today <- format(Sys.Date(), "%Y-%m-%d")
output_xlsx <- file.path(
base_dir,
paste0("olink_mr_results_zsuzsu_list_", today, ".xlsx")
)
# === Read the list of UniProt IDs from olink.csv ===
olink_dt <- fread(olink_csv, header = TRUE)
uniprot_list <- unique(olink_dt$UniProt)
# === Get sheet names ===
sheets <- getSheetNames(input_xlsx)
# === Prepare output workbook ===
wb_out <- createWorkbook()
# === Process each sheet ===
for (sh in sheets) {
message("Processing sheet: ", sh)
df <- read.xlsx(input_xlsx, sheet = sh)
dt <- as.data.table(df)
if (toupper(sh) != "TOC" && "id.exposure" %in% names(dt)) {
# extract the UniProt segment (second underscore-delimited field)
dt[, uniprot_exposure := tstrsplit(id.exposure, "_", keep = 2)]
# filter by the olink list
dt <- dt[uniprot_exposure %in% uniprot_list]
# drop the helper column
dt[, uniprot_exposure := NULL]
}
# write (filtered or unfiltered TOC) to the new workbook
addWorksheet(wb_out, sh)
writeData(wb_out, sh, dt)
}
# === Save the filtered workbook ===
saveWorkbook(wb_out, output_xlsx, overwrite = TRUE)
message("Filtered workbook written to: ", output_xlsx)
https://yodamendel.shinyapps.io/aric_olink_app/
#!/usr/bin/env Rscript
# setwd("/Volumes/LaCie/tahir/aric_olink_app") # Do before running
# app.R
# Shiny app to display two Excel files as one tabset (excluding TOC) with side-by-side logo and download links
# Run with:
# library(shiny)
# runApp()
#
# Deploy with:
# library(rsconnect)
# rsconnect::deployApp(
# appDir = "/Volumes/LaCie/tahir/aric_olink_app",
# appName = "aric_olink_app",
# appFiles = c("app.R", "shiny_error.log", ".rsconnectignore",
# "www/BIDMC_HMS_Stacked-LockUp.png",
# "www/zsu_zsu_set_aric_8_outcomes_2025-05-21.xlsx",
# "www/zsu_zsu_set_olink_8_outcomes_2025-05-21.xlsx"),
# lint = FALSE,
# forceUpdate = TRUE,
# logLevel = "verbose"
# )
#
# Before deploying, clean hidden macOS files and create .rsconnectignore:
# cd /Volumes/LaCie/tahir/aric_olink_app
# find . -name "._*" -delete
# rm -rf rsconnect
# echo -e "._*\ndownloads/\nrsconnect/" > .rsconnectignore
# export COPYFILE_DISABLE=1
# Install required packages if not already installed
if (!require(shiny)) install.packages("shiny")
if (!require(shinydashboard)) install.packages("shinydashboard")
if (!require(DT)) install.packages("DT")
if (!require(readxl)) install.packages("readxl")
if (!require(dplyr)) install.packages("dplyr")
if (!require(shinyWidgets)) install.packages("shinyWidgets")
# Load libraries
library(shiny)
library(shinydashboard)
library(DT)
library(readxl)
library(dplyr)
library(shinyWidgets)
# Define UI
ui <- fluidPage(
# Custom CSS for professional styling with taupe color scheme
tags$head(
tags$style(HTML("
body {
font-family: 'Arial', sans-serif;
background-color: #F5F4EF; /* Soft Beige Background */
color: #333333; /* Dark Gray Text */
}
#main_title {
color: #333333; /* Dark Gray */
font-weight: bold;
font-size: 2.5em;
margin-top: 20px;
}
.lab-link { /* a class, not an id: the style is shared by two links below */
color: #483C32; /* Taupe */
font-size: 1.2em;
text-decoration: none;
transition: color 0.3s;
}
.lab-link:hover {
color: #3A2F26; /* Darker Taupe */
text-decoration: underline;
}
.download-box {
background-color: #FFFFFF; /* White */
border: 1px solid #DCDCDC; /* Light Gray Border */
border-radius: 8px;
padding: 20px;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
text-align: center;
width: 100%; /* Full width within column */
}
.download-link {
color: #483C32; /* Taupe */
text-decoration: none;
font-weight: bold;
transition: color 0.3s;
display: block; /* One link per line */
margin: 10px 0; /* Spacing between links */
}
.download-link:hover {
color: #3A2F26; /* Darker Taupe */
text-decoration: underline;
}
.data-table-container {
background-color: #FFFFFF; /* White */
border-radius: 8px;
padding: 20px;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
margin-bottom: 20px;
}
#logo_container {
text-align: center;
margin: 20px 0; /* Consistent margin */
}
#logo_image {
max-width: 300px;
height: auto;
}
.nav-tabs > li > a {
background-color: #EDEAE5; /* Light Beige */
color: #333333; /* Dark Gray */
border-radius: 5px 5px 0 0;
margin-right: 5px;
font-weight: bold;
}
.nav-tabs > li.active > a {
background-color: #483C32; /* Taupe */
color: white;
}
.error-message {
color: #D9534F; /* Soft Red */
font-weight: bold;
text-align: center;
margin: 20px;
}
"))
),
# Header Title & Lab Link
div(
style = "text-align: center; margin-bottom: 10px;",
h1(HTML("Cis-MR of ARIC & Olink Proteins with Eight Outcomes"), id = "main_title")
),
div(
style = "text-align: center; margin-bottom: 30px;",
tags$a(
href = "https://www.bidmc.org/research/research-by-department/medicine/cardiovascular-medicine/personal-genomics-and-cardiometabolic-disease",
target = "_blank",
"🔬 Lab",
id = "lab_link"
),
tags$br(),
tags$a(
href = "https://rpubs.com/YodaMendel/1313301",
target = "_blank",
"📝 Notes and Code",
id = "lab_link"
)
),
# Main content with tabbed DataTables
fluidRow(
column(
width = 12,
div(
class = "data-table-container",
uiOutput("excelTabs"),
textOutput("errorMessage")
)
)
),
# Side-by-side Logo and Download Links
fluidRow(
# Logo Column
column(
width = 6,
div(
id = "logo_container",
tags$a(
href = "https://www.bidmc.org/research/research-by-department/medicine/cardiovascular-medicine/personal-genomics-and-cardiometabolic-disease",
target = "_blank",
tags$img(
src = "BIDMC_HMS_Stacked-LockUp.png",
id = "logo_image",
alt = "Beth Israel Deaconess Medical Center Logo"
)
)
)
),
# Download Links Column
column(
width = 6,
div(
class = "download-box",
h4("Download All Results", style = "color: #333333;"),
tags$p("Select a file to download:"),
div(
tags$a(href = "https://docs.google.com/spreadsheets/d/1z-brptCcLjgsJz_Sedh3LJVke4NiiMqa", "ARIC Zsu-Zsu Set", class = "download-link", target = "_blank"),
tags$a(href = "https://drive.google.com/uc?export=download&id=1CuQSqmngxvhgWQThJtLdsQtxkn62mm1s", "ARIC Full Results", class = "download-link", target = "_blank"),
tags$a(href = "https://docs.google.com/spreadsheets/d/15H7BbwXSUiqhnp2M9DZImDpKT00sP1Oz", "Olink Zsu-Zsu Set", class = "download-link", target = "_blank"),
tags$a(href = "https://drive.google.com/uc?export=download&id=19p81IdmViEuN8wAlOhv1cmoF2EU-ZYHH", "Olink Full Results", class = "download-link", target = "_blank")
)
)
)
)
)
# Define Server
server <- function(input, output, session) {
# Paths to Excel files for display
excel_files <- c(
file.path("www", "zsu_zsu_set_aric_8_outcomes_2025-05-21.xlsx"),
file.path("www", "zsu_zsu_set_olink_8_outcomes_2025-05-21.xlsx")
)
# Path to logo file
logo_file <- file.path("www", "BIDMC_HMS_Stacked-LockUp.png")
# Log file for debugging
log_file <- "shiny_error.log"
log_message <- function(msg) {
cat(paste0(Sys.time(), ": ", msg, "\n"), file = log_file, append = TRUE)
}
log_message("Starting Shiny app")
# Check if files exist
output$errorMessage <- renderText({
errors <- c()
for (f in excel_files) {
if (!file.exists(f)) {
msg <- paste("Error: Excel file not found at", f)
log_message(msg)
errors <- c(errors, msg)
}
}
if (!file.exists(logo_file)) {
msg <- paste("Error: Logo file not found at", logo_file)
log_message(msg)
errors <- c(errors, msg)
}
if (length(errors) > 0) {
return(paste(errors, collapse = "\n"))
}
log_message("All required files found")
""
})
# Read Excel sheet names from both files, excluding TOC
sheet_names <- reactive({
req(all(file.exists(excel_files)))
tryCatch({
all_sheets <- lapply(seq_along(excel_files), function(i) {
sheets <- excel_sheets(excel_files[i])
# Exclude TOC sheet (case-insensitive)
sheets <- sheets[!tolower(sheets) == "toc"]
if (length(sheets) == 0) {
msg <- paste("Error: No non-TOC sheets found in", excel_files[i])
log_message(msg)
return(NULL)
}
# Prefix sheet names with file identifier to avoid duplicates
prefix <- ifelse(i == 1, "ARIC_", "Olink_")
paste0(prefix, sheets)
})
valid_sheets <- unlist(all_sheets)
if (length(valid_sheets) == 0) {
msg <- "Error: No valid non-TOC sheets found in Excel files"
log_message(msg)
output$errorMessage <- renderText(msg)
return(NULL)
}
log_message(paste("Found", length(valid_sheets), "non-TOC sheets:", paste(valid_sheets, collapse = ", ")))
valid_sheets
}, error = function(e) {
msg <- paste("Error reading Excel sheets:", e$message)
log_message(msg)
output$errorMessage <- renderText(msg)
return(NULL)
})
})
# Reactive value to store data frames with rounded columns
excel_data <- reactive({
req(sheet_names())
tryCatch({
data_list <- lapply(seq_along(excel_files), function(i) {
sheets <- excel_sheets(excel_files[i])
# Exclude TOC sheet
sheets <- sheets[!tolower(sheets) == "toc"]
prefix <- ifelse(i == 1, "ARIC_", "Olink_")
lapply(sheets, function(sheet) {
tryCatch({
df <- read_excel(excel_files[i], sheet = sheet)
if (nrow(df) == 0 || ncol(df) == 0) {
log_message(paste("Sheet", sheet, "in", excel_files[i], "is empty"))
return(NULL)
}
# Round the standard MR results columns (exact, case-insensitive name matches) to 4 decimal places
cols_to_round <- grep("^(egger_intercept|q_df|q_pval|q|b|se|stderror|p|pval|Estimate|CILower|CIUpper|Pvalue)$", names(df), ignore.case = TRUE, value = TRUE)
if (length(cols_to_round) > 0) {
df <- df %>%
mutate(across(all_of(cols_to_round), ~ round(as.numeric(.), 4)))
}
log_message(paste("Read sheet", sheet, "from", excel_files[i], "with", nrow(df), "rows and", ncol(df), "columns"))
df
}, error = function(e) {
log_message(paste("Error reading sheet", sheet, "from", excel_files[i], ":", e$message))
return(NULL)
})
})
})
# Flatten and name the list
data_list <- unlist(data_list, recursive = FALSE)
keep <- !sapply(data_list, is.null) # record which sheets survive before filtering
data_list <- data_list[keep]
if (length(data_list) == 0) {
msg <- "Error: No non-TOC sheets contain valid data"
log_message(msg)
output$errorMessage <- renderText(msg)
return(NULL)
}
names(data_list) <- sheet_names()[keep]
log_message(paste("Loaded", length(data_list), "non-TOC sheets with valid data"))
data_list
}, error = function(e) {
msg <- paste("Error reading Excel data:", e$message)
log_message(msg)
output$errorMessage <- renderText(msg)
return(NULL)
})
})
# Render tabset for Excel sheets
output$excelTabs <- renderUI({
req(excel_data())
data <- excel_data()
if (is.null(data) || length(data) == 0) {
log_message("No data available to display")
return(div(class = "error-message", "No valid non-TOC data sheets available to display."))
}
valid_sheets <- names(data)
tabs <- lapply(seq_along(valid_sheets), function(i) {
tabPanel(
title = valid_sheets[i],
DTOutput(paste0("table_", i))
)
})
log_message(paste("Rendering tabset with", length(tabs), "tabs"))
do.call(tabsetPanel, tabs)
})
# Render DataTables for each sheet
observe({
data <- excel_data()
req(data)
valid_sheets <- names(data)
lapply(seq_along(valid_sheets), function(i) {
output[[paste0("table_", i)]] <- renderDT({
tryCatch({
log_message(paste("Rendering table for sheet", valid_sheets[i]))
datatable(
data[[i]],
filter = "top",
options = list(
pageLength = 10,
autoWidth = TRUE,
scrollX = TRUE
),
class = 'cell-border stripe'
)
}, error = function(e) {
msg <- paste("Error rendering table for sheet", valid_sheets[i], ":", e$message)
log_message(msg)
output$errorMessage <- renderText(msg)
return(NULL)
})
})
})
})
}
# Run the Shiny App
shinyApp(ui = ui, server = server)