rsync -aWv --progress /Users/charleenadams/ukbppp/ /Volumes/LaCie/ukbppp/
Processed files located at:
/Volumes/LaCie/ukbppp/coloc_clin_mol_extracted/
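To confirm the backup is complete before touching the originals, a dry-run re-sync should report no remaining transfers (a minimal sketch using the same source and destination):
# -n = dry run; lists anything that still differs between source and backup
rsync -an --stats /Users/charleenadams/ukbppp/ /Volumes/LaCie/ukbppp/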
For the previous batches of ~200 proteins, I'd merged the chromosome files, but merging is unnecessary and isn't feasible for all ~3,000.
cd /Volumes/LaCie/tahir/mr_ukbppp_all_sbp/scripts
nohup ./copy_tss_chr_fast.sh > copy_tss_chr_fast.log 2>&1 &
/Volumes/LaCie/ukbppp/coloc_clin_mol_extracted/TSS_chr
#!/usr/bin/env bash
set -euo pipefail
IFS=$'\n\t'
# ——— CONFIG ———
BASE_DIR="/Volumes/LaCie/ukbppp/coloc_clin_mol_extracted"
ENSEMBL_FILE="/Volumes/LaCie/tahir/mr_ukbppp_bp/mart_clean_37_alias.txt"
TSS_DIR="${BASE_DIR}/TSS_chr"
LOG_FILE="${BASE_DIR}/copy_tss_chr_fast_fixed.log"
UNMATCHED="${BASE_DIR}/unmatched_genes.txt"
mkdir -p "$TSS_DIR"
: > "$LOG_FILE"
: > "$UNMATCHED"
# ——— PROCESS EACH SUBDIR ———
for subdir in "$BASE_DIR"/*/; do
  [[ "${subdir%/}" = "$TSS_DIR" ]] && continue
  name=$(basename "$subdir")
  gene=${name%%_*}
  geneU=$(printf "%s" "$gene" | tr '[:lower:]' '[:upper:]')
  # lookup chrom via awk (first match)
  chrom=$(awk -F'\t' -v G="$geneU" '
    NR>1 && (toupper($6)==G || toupper($7)==G) { print $2; exit }
  ' "$ENSEMBL_FILE")
  if [[ -z "$chrom" ]]; then
    printf "❌ No chrom for %s\n" "$gene" >>"$LOG_FILE"
    printf "%s\n" "$gene" >>"$UNMATCHED"
    continue
  fi
  # turn underscores into colons to match your filenames
  idpart=${name//_/:}
  fname="discovery_chr${chrom}_${idpart}_rsid_added.gz"
  src="$subdir/$fname"
  if [[ -f "$src" ]]; then
    cp "$src" "$TSS_DIR/"
    printf "✅ Copied %s\n" "$fname" >>"$LOG_FILE"
  else
    printf "⚠️ Missing %s in %s\n" "$fname" "$name" >>"$LOG_FILE"
  fi
done
echo "Done. See $LOG_FILE and $UNMATCHED"
The original script above could have broken about halfway through for the following reasons:
1. Associative arrays unsupported. macOS ships with Bash 3.2, which doesn't support declare -A.
→ Removed all associative-array usage; switched to inline awk lookups.
2. Uppercase expansion ${var^^} unsupported. That syntax only exists in Bash 4+ (a quick version check follows this list).
→ Replaced with the portable: geneU=$(printf "%s" "$gene" | tr '[:lower:]' '[:upper:]')
3. Stray backticks and mismatched quotes. Unmatched backticks or curly quotes caused parse errors.
→ Stripped out any stray backticks and standardized on plain '…' and "…" pairs.
4. Over-eager underscore→colon replacement. Converting every underscore mangled multi-part IDs.
→ Split the directory name into a prefix and a trait suffix, replace underscores with colons only in the prefix, then re-append the trait.
5. Directories without any matching chr files. Some subdirs lack any discovery_chr… files, so the script logged "Missing …" but did nothing else.
→ Now logs and continues, without aborting or duplicating work.
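A quick way to confirm which Bash will run the script; on stock macOS, /bin/bash is 3.2 and both constructs fail:
/bin/bash -c 'echo $BASH_VERSION'    # 3.2.57(1)-release on stock macOS
/bin/bash -c 'declare -A m'          # fails on 3.2: "declare: -A: invalid option"
/bin/bash -c 'v=abc; echo ${v^^}'    # fails on 3.2: "bad substitution"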
#!/usr/bin/env bash
set -euo pipefail
IFS=$'\n\t'
# ——— CONFIG ———
BASE_DIR="/Volumes/LaCie/ukbppp/coloc_clin_mol_extracted"
ENSEMBL_FILE="/Volumes/LaCie/tahir/mr_ukbppp_bp/mart_clean_37_alias.txt"
TSS_DIR="${BASE_DIR}/TSS_chr"
LOG_FILE="${BASE_DIR}/copy_tss_chr_fast_fixed.log"
UNMATCHED="${BASE_DIR}/unmatched_genes.txt"
mkdir -p "$TSS_DIR"
: > "$LOG_FILE"
: > "$UNMATCHED"
# ——— LOOP OVER EACH SUB-DIRECTORY ———
for subdir in "$BASE_DIR"/*/; do
  # skip the TSS_chr directory itself
  [[ "${subdir%/}" = "$TSS_DIR" ]] && continue
  name=$(basename "$subdir")   # e.g. OSMR_Q99650_OID20356_v1_Cardiometabolic
  gene=${name%%_*}             # e.g. OSMR
  # uppercase (just in case); the portable form, since ${gene^^} needs Bash 4+
  geneU=$(printf "%s" "$gene" | tr '[:lower:]' '[:upper:]')
  # look up chr in the mart file
  chrom=$(awk -F'\t' -v G="$geneU" '
    NR>1 && (toupper($6)==G || toupper($7)==G) { print $2; exit }
  ' "$ENSEMBL_FILE")
  if [[ -z "$chrom" ]]; then
    echo "❌ No chromosome for $geneU" >>"$LOG_FILE"
    echo "$geneU" >>"$UNMATCHED"
    continue
  fi
  # split prefix vs trait so we only turn underscores into colons in the *prefix*
  prefix="${name%_*}"             # OSMR_Q99650_OID20356_v1
  trait="${name##*_}"             # Cardiometabolic
  prefix_colon="${prefix//_/:}"   # OSMR:Q99650:OID20356:v1
  idpart="${prefix_colon}_${trait}"
  fname="discovery_chr${chrom}_${idpart}_rsid_added.gz"
  src="${subdir}${fname}"
  dst="${TSS_DIR}/${fname}"
  # skip if already copied
  if [[ -f "$dst" ]]; then
    echo "⏭ Skipped (already have) $fname" >>"$LOG_FILE"
    continue
  fi
  if [[ -f "$src" ]]; then
    cp "$src" "$TSS_DIR/"
    echo "✅ Copied $fname" >>"$LOG_FILE"
  else
    echo "⚠️ Missing $fname in $name" >>"$LOG_FILE"
  fi
done
echo "Done. See log: $LOG_FILE"
echo "Unmatched genes in: $UNMATCHED"
This script uses MyGene.info to fetch aliases for the unmatched genes:
#!/usr/bin/env python3
"""
Read a list of unmatched gene symbols, query MyGene.info for aliases/other_names,
and write results to a TSV.
"""
import os
from mygene import MyGeneInfo
def main():
    # ── CONFIG ──
    infile = "/Volumes/LaCie/ukbppp/coloc_clin_mol_extracted/unmatched_genes_fixed2.txt"
    outfile = "unmatched_genes_with_aliases.tsv"
    # check input
    if not os.path.isfile(infile):
        raise FileNotFoundError(f"Input file not found: {infile}")
    # read queries
    with open(infile) as f:
        genes = [line.strip() for line in f if line.strip()]
    print(f"→ {len(genes)} genes to query")
    mg = MyGeneInfo()
    # querymany defaults to batch-size=1000, will chunk automatically
    results = mg.querymany(
        genes,
        scopes="symbol",
        fields="alias,other_names,entrezgene,ensembl",
        species="human",
        as_dataframe=False,
        verbose=False
    )
    # write TSV header
    with open(outfile, "w") as out:
        out.write("query\talias\tother_names\tentrezgene\tensembl_gene\n")
        for entry in results:
            q = entry.get("query", "")
            # gather aliases
            aliases = entry.get("alias", [])
            if isinstance(aliases, str):
                aliases = [aliases]
            # gather other_names
            other = entry.get("other_names", [])
            if isinstance(other, str):
                other = [other]
            # entrezgene
            eg = entry.get("entrezgene", "")
            # ensembl can be dict or list
            ensembl = entry.get("ensembl", "")
            if isinstance(ensembl, dict):
                ensg = ensembl.get("gene", "")
            elif isinstance(ensembl, list):
                ensg = ",".join(item.get("gene", "") for item in ensembl)
            else:
                ensg = ""
            # write line
            out.write(
                f"{q}\t"
                f"{','.join(aliases)}\t"
                f"{','.join(other)}\t"
                f"{eg}\t"
                f"{ensg}\n"
            )
    print(f"✅ Written results to {outfile}")


if __name__ == "__main__":
    main()
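Run it and inspect the first rows of the TSV; the script filename below is a placeholder for whatever the file above was saved as:
python3 fetch_unmatched_aliases.py   # placeholder name for the script above
head -n 3 unmatched_genes_with_aliases.tsv | column -t -s$'\t'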
./retry_copy_tss_chr.sh
#!/usr/bin/env bash
set -euo pipefail
IFS=$'\n\t'
# ── CONFIG ──
BASE_DIR="/Volumes/LaCie/ukbppp/coloc_clin_mol_extracted"
ENSEMBL_FILE="/Volumes/LaCie/tahir/mr_ukbppp_bp/mart_clean_37_alias.txt"
UNMATCHED_ALIAS_TSV="/Volumes/LaCie/ukbppp/scripts/unmatched_genes_with_aliases.tsv"
TSS_DIR="$BASE_DIR/TSS_chr"
LOG_FILE="$BASE_DIR/retry_copy.log"
mkdir -p "$TSS_DIR"
: > "$LOG_FILE"
# ── FOR EACH unmatched gene + aliases ──
tail -n +2 "$UNMATCHED_ALIAS_TSV" | \
while IFS=$'\t' read -r query alias other_names entrezgene ensembl; do
  # uppercase the query & build comma-sep list of synonyms
  UQ=$(printf "%s" "$query" | tr '[:lower:]' '[:upper:]')
  syns="$UQ"
  [[ -n "$alias" ]] && syns+=",${alias}"
  [[ -n "$other_names" ]] && syns+=",${other_names}"
  # find chromosome by scanning mart file for any synonym
  chrom=""
  IFS=',' read -ra ARR <<<"$syns"
  for s in "${ARR[@]}"; do
    SU=$(printf "%s" "$s" | tr '[:lower:]' '[:upper:]')
    chrom=$(awk -F'\t' -v G="$SU" '
      NR>1 && (toupper($6)==G || toupper($7)==G) { print $2; exit }
    ' "$ENSEMBL_FILE")
    [[ -n "$chrom" ]] && break
  done
  if [[ -z "$chrom" ]]; then
    echo "❌ No chr for $query (aliases: $syns)" >>"$LOG_FILE"
    continue
  fi
  # locate that gene's subdirectory under BASE_DIR
  subdir=$(find "$BASE_DIR" -maxdepth 1 -type d -name "${query}_*" | head -n1)
  if [[ -z "$subdir" ]]; then
    echo "⚠️ No subdir for $query" >>"$LOG_FILE"
    continue
  fi
  name=$(basename "$subdir")
  # same fix as above: only the prefix underscores become colons; the trait suffix keeps its underscore
  prefix="${name%_*}"
  trait="${name##*_}"
  idpart="${prefix//_/:}_${trait}"
  fname="discovery_chr${chrom}_${idpart}_rsid_added.gz"
  src="$subdir/$fname"
  if [[ -f "$src" ]]; then
    cp "$src" "$TSS_DIR/"
    echo "✅ Copied $query → $fname" >>"$LOG_FILE"
  else
    echo "⚠️ Missing $fname in $subdir" >>"$LOG_FILE"
  fi
done
echo "Done retry. See $LOG_FILE"
Processed previously. SBP file located at:
/Volumes/LaCie/tahir/mr_ukbppp_bp/ieu-b-38_formatted.tsv
cd /Volumes/LaCie/tahir/mr_ukbppp_bp
wget https://gwas.mrcieu.ac.uk/files/ieu-b-38/ieu-b-38.vcf.gz
“Here, we report genome-wide discovery analyses of BP traits - systolic (SBP), diastolic (DBP) and pulse pressure (PP) - in people of European ancestry drawn from UK Biobank (UKB) and the International Consortium of Blood Pressure-Genome Wide Association Studies (ICBP). We adopted a combination of a one- and two-stage study design to test common and low-frequency single nucleotide polymorphisms (SNPs) with minor allele frequency (MAF) ≥ 1% associated with BP traits. In all, we studied over 1 million people of European descent, including replication data from the US Million Veterans Program (MVP, N=220,520) and the Estonian Genome Centre, University of Tartu (EGCUT, N=28,742) Biobank.”
For more details, refer to the full paper.
Step 1: Bash script to convert the VCF to TSV and format the headers for TwoSampleMR
Dependencies:
• gunzip (to decompress the VCF)
• awk (to extract fields and compute p-values from LOG10P)
Run with:
cd /Volumes/LaCie/tahir/mr_ukbppp_bp/scripts
chmod +x format_ieu-b-38.vcf.sh
nohup ./format_ieu-b-38.vcf.sh > format_ieu-b-38.vcf.log 2>&1 &
ps -p 96562 -o stat,etime
# Step 1: Decompress ieu-b-38.vcf.gz without deleting the original
cd /Volumes/LaCie/tahir/mr_ukbppp_bp
gunzip -k ieu-b-38.vcf.gz
# Step 2: Convert the decompressed .vcf file to .tsv and compute p-values from LP (-log10 p)
outfile="ieu-b-38_formatted.tsv"
# Write header
echo -e "CHR\tBP\tSNP\tREF\tALT\tEAF\tBETA\tSE\tLP\tPVAL" > "$outfile"
# Extract fields and calculate pval
awk 'BEGIN{OFS="\t"}
!/^#/ {
    split($10, a, ":");
    beta = a[1];
    se   = a[2];
    lp   = a[3];
    eaf  = a[4];
    pval = 10^(-lp);
    print $1, $2, $3, $4, $5, eaf, beta, se, lp, pval
}' ieu-b-38.vcf >> "$outfile"
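A quick sanity check on the output, assuming the column order written above (SNP = column 3, LP = column 9, PVAL = column 10); the last two printed values should agree for each row:
head -n 3 ieu-b-38_formatted.tsv | column -t
awk 'NR>1 && NR<=4 {printf "%s\t%.4g\t%.4g\n", $3, $10, 10^(-$9)}' ieu-b-38_formatted.tsv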
./preprocess.sh
#!/bin/bash
# Base directories
BASE_DIR="/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data"
TEMP_DIR="${BASE_DIR}/temp"
mkdir -p "${TEMP_DIR}"
# Function to process a single file
process_file() {
  file="$1"
  filename=$(basename "$file")
  temp_file="${TEMP_DIR}/${filename%.gz}.temp"
  output_file="${TEMP_DIR}/${filename}"
  # Skip if already preprocessed
  if [ -f "$output_file" ]; then
    echo "Skipping $filename: Already preprocessed"
    return
  fi
  # Decompress and normalize any run of spaces/tabs to single tab delimiters
  gunzip -c "$file" | awk '
    BEGIN { FS=OFS="\t" }
    {
      n = split($0, fields, /[ \t]+/)
      if (NR == 1) {
        # header row: re-join all fields with tabs
        for (i = 1; i <= n; i++) {
          if (i > 1) printf "\t"
          printf "%s", fields[i]
        }
        printf "\n"
      } else {
        # data rows: re-join the 14 core columns, then any extras, with tabs
        for (i = 1; i <= 14; i++) {
          if (i > 1) printf "\t"
          printf "%s", fields[i]
        }
        for (i = 15; i <= n; i++) {
          printf "\t%s", fields[i]
        }
        printf "\n"
      }
    }' > "$temp_file"
  # Compress the temp file back to .gz
  gzip -c "$temp_file" > "$output_file"
  rm "$temp_file"
  echo "Preprocessed $filename"
}
export -f process_file
export TEMP_DIR
# Process files in parallel (adjust --jobs to your CPU cores, e.g., 8)
find "${BASE_DIR}" -maxdepth 1 -name "*.gz" | parallel --jobs 8 process_file {}
This puts the files in the same format as the one that worked in the previous runs of ~200 proteins.
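Note the last step requires GNU parallel. If it isn't installed, a rough fallback is xargs, which also picks up the exported process_file function (a sketch; swap it in for the parallel line inside the script, 8 jobs assumed):
find "${BASE_DIR}" -maxdepth 1 -name "*.gz" -print0 | \
  xargs -0 -n1 -P8 bash -c 'process_file "$0"'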
Rscript name_P_fix.R
# Load required packages
library(data.table)
library(parallel)
# Define directories
base_dir <- "/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data"
temp_dir <- file.path(base_dir, "temp")
fixed_dir <- file.path(base_dir, "fixed")
# Ensure fixed directory exists
dir.create(fixed_dir, showWarnings = FALSE, recursive = TRUE)
# Function to process a single file
process_file <- function(input_file) {
  tryCatch({
    # Output file (renamed: remove prefix, replace colons with underscores)
    output_filename <- sub("discovery_chr[0-9]+_", "", basename(input_file))
    output_filename <- gsub(":", "_", output_filename)
    output_file <- file.path(fixed_dir, output_filename)
    # Read the preprocessed file
    data <- fread(
      file = input_file,
      header = TRUE,
      sep = "\t",
      na.strings = "NA",
      colClasses = c(
        CHROM = "character",
        GENPOS = "integer",
        ID = "character",
        ALLELE0 = "character",
        ALLELE1 = "character",
        A1FREQ = "numeric",
        INFO = "numeric",
        N = "integer",
        TEST = "character",
        BETA = "numeric",
        SE = "numeric",
        CHISQ = "numeric",
        LOG10P = "numeric",
        EXTRA = "character",
        rsid = "character",
        POS19 = "integer",
        POS38 = "integer"
      )
    )
    # Add P column
    data[, P := 10^(-LOG10P)]
    # Reorder columns to place P after POS38
    setcolorder(data, c(
      "CHROM", "GENPOS", "ID", "ALLELE0", "ALLELE1", "A1FREQ", "INFO", "N",
      "TEST", "BETA", "SE", "CHISQ", "LOG10P", "EXTRA", "rsid", "POS19", "POS38", "P"
    ))
    # Save to fixed directory
    fwrite(
      data,
      file = output_file,
      sep = "\t",
      na = "NA",
      compress = "gzip",
      quote = FALSE
    )
    cat(sprintf("Processed %s -> %s\n", basename(input_file), basename(output_file)))
  }, error = function(e) {
    cat(sprintf("Error processing %s: %s\n", basename(input_file), conditionMessage(e)))
  })
}
# Get list of .gz files in temp directory
files <- list.files(temp_dir, pattern = "\\.gz$", full.names = TRUE)
# Process files in parallel (adjust number of cores as needed)
num_cores <- detectCores() - 1 # Use all but one core
mclapply(files, process_file, mc.cores = num_cores)
cat("All files processed\n")
Remove empty files
# Remove empty files listed by empty.py
while read -r file; do
  rm "$file"
done <<EOF
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_ADAMTSL4:Q6UY14:OID30275:v1:Cardiometabolic_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_ADAMTS4:O75173:OID30471:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_EXOSC10:Q01780:OID30083:v1:Cardiometabolic_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_AGRN:O00468:OID20786:v1:Inflammation_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_AGT:P01019:OID30709:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_AK2:P54819:OID31142:v1:Neurology_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_AKR7L:Q8NHP1:OID30488:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_AKT3:Q9Y243:OID21197:v1:Oncology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_AMIGO1:Q86WK6:OID30881:v1:Neurology_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_AMY2A:P04746:OID20333:v1:Cardiometabolic_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_AMY2B:P19961:OID20340:v1:Cardiometabolic_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_ANGPTL1:O95841:OID20211:v1:Cardiometabolic_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_ANGPTL3:Q9Y5C1:OID20407:v1:Cardiometabolic_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_ANGPTL7:O43827:OID21412:v1:Oncology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_APCS:P02743:OID30778:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_ARG1:P05089:OID21445:v1:Oncology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_ARHGAP30:Q7Z6I6:OID31336:v1:Oncology_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_ARID4B:Q4LE39:OID20803:v1:Neurology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_ARNT:P27540:OID20432:v1:Inflammation_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_ARTN:Q5T4W7:OID20446:v1:Inflammation_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_ATP1B1:P05026:OID30124:v1:Cardiometabolic_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_ATP2B4:P23634:OID30062:v1:Cardiometabolic_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_ATP5IF1:Q9UII2:OID20760:v1:Inflammation_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_BCAN:Q96GW7:OID20998:v1:Neurology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_BCL2L15:Q5TBC7:OID30498:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_BGLAP:P02818:OID30389:v1:Cardiometabolic_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_BOLA1:Q9Y3E2:OID30837:v1:Neurology_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_BRDT:Q58F21:OID30179:v1:Cardiometabolic_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_BSND:Q8WZ55:OID31184:v1:Oncology_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_BTNL10:A8MVZ5:OID31196:v1:Oncology_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_C1QA:P02745:OID20654:v1:Inflammation_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_C4BPB:P20851:OID21481:v1:Oncology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_C8B:P07358:OID30719:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CA14:Q9ULX7:OID21401:v1:Oncology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CA6:P23280:OID21096:v1:Neurology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CACYBP:Q9HB71:OID30649:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CASP9:P55211:OID30530:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CASQ2:O14958:OID30063:v1:Cardiometabolic_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CD101:Q93033:OID31480:v1:Oncology_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CD160:O95971:OID20647:v1:Inflammation_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CD164L2:Q6UWJ8:OID30922:v1:Neurology_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CD1C:P29017:OID21483:v1:Oncology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CD2:P06729:OID30097:v1:Cardiometabolic_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CD244:Q9BZW8:OID20628:v1:Inflammation_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CD34:P28906:OID21025:v1:Neurology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CD46:P15529:OID20380:v1:Cardiometabolic_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CD48:P09326:OID20692:v1:Inflammation_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CD55:P08174:OID20377:v1:Cardiometabolic_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CD58:P19256:OID20716:v1:Inflammation_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CD5L:O43866:OID30775:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CD84:Q9UIB8:OID20607:v1:Inflammation_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CDA:P32320:OID30345:v1:Cardiometabolic_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CELA2A:P08217:OID30411:v1:Cardiometabolic_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CELA3A:P09093:OID20298:v1:Cardiometabolic_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CELSR2:Q9HCU4:OID30593:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CENPF:P49454:OID31329:v1:Oncology_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CEP170:Q5SW79:OID30240:v1:Cardiometabolic_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CEP350:Q5VT06:OID31247:v1:Oncology_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CEP85:Q6P2H3:OID21241:v1:Oncology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CFH:P08603:OID30790:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CFHR2:P36980:OID30720:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CFHR4:Q92496:OID30758:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CFHR5:Q9BXR6:OID30716:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CGN:Q9P2M7:OID30894:v1:Neurology_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CHI3L1:P36222:OID20399:v1:Cardiometabolic_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CHIT1:Q13231:OID20312:v1:Cardiometabolic_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CLSPN:Q9HAW4:OID20831:v1:Neurology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CLSTN1:O94985:OID20874:v1:Neurology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CNST:Q6PJW8:OID20127:v1:Cardiometabolic_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CNTN2:Q02246:OID21426:v1:Oncology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_COL24A1:Q17RW2:OID31464:v1:Oncology_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_COL9A2:Q14055:OID31255:v1:Oncology_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CPTP:Q5TA50:OID30106:v1:Cardiometabolic_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CR1:P17927:OID30697:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CR2:P20023:OID20393:v1:Cardiometabolic_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CREG1:O75629:OID21515:v1:Oncology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CRNN:Q9UBG3:OID21427:v1:Oncology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CSDE1:O75534:OID30183:v1:Cardiometabolic_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CSF1:P09603:OID20719:v1:Inflammation_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CSF3R:Q99062:OID30527:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CTBS:Q01459:OID30710:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CTRC:Q99895:OID20752:v1:Inflammation_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CTSE:P14091:OID30628:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CTSO:P43234:OID20660:v1:Inflammation_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_CTSS:P25774:OID21056:v1:Neurology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_DDAH1:O94760:OID21299:v1:Oncology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_DDI2:Q5TDH0:OID30602:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_DFFA:O00273:OID20620:v1:Inflammation_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_DNAJC6:O75061:OID30247:v1:Cardiometabolic_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_DNM3:Q9UQ16:OID31000:v1:Neurology_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_DPT:Q07507:OID20278:v1:Cardiometabolic_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_DRAXIN:Q8NBI3:OID20917:v1:Neurology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_ECE1:P42892:OID20891:v1:Neurology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_ECM1:Q16610:OID30736:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_EFCAB14:O75071:OID31460:v1:Oncology_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_EFCAB2:Q5VUJ9:OID31313:v1:Oncology_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_EFNA1:P20827:OID21125:v1:Neurology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_EFNA4:P52798:OID20935:v1:Neurology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_EGLN1:Q9GZT9:OID20572:v1:Inflammation_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_EIF4G3:O43432:OID31022:v1:Neurology_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_ELAVL4:P26378:OID30854:v1:Neurology_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_ELOA:Q14241:OID21337:v1:Oncology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_ENAH:Q8N8S7:OID20497:v1:Inflammation_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_ENO1:P06733:OID21037:v1:Neurology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_ENSA:O43768:OID31355:v1:Oncology_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_EPHA10:Q5JZY3:OID20840:v1:Neurology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_EPHA2:P29317:OID21342:v1:Oncology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_ERMAP:Q96PL5:OID30539:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_EVI5:O60447:OID30519:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_F11R:Q9Y624:OID21151:v1:Neurology_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_F13B:P05160:OID30781:v1:Inflammation_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_FABP3:P05413:OID30351:v1:Cardiometabolic_II_rsid_added.gz
/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp/discovery_chr1_FASLG:P48023:OID20665:v1:Inflammation_rsid_added.gz
EOF
# Verify removal
find /Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp -name "*.gz" -size 0
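Equivalently, the zero-byte files can be found and removed in one pass, without maintaining the list by hand (preview with -print first, since -delete is destructive):
find /Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp -name "*.gz" -size 0 -print
find /Volumes/LaCie/tahir/mr_ukbppp_all_sbp/data/temp -name "*.gz" -size 0 -delete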
The original script, which combined harmonization, LD clumping, and MR, failed due to regex issues with the protein file names. It worked up to harmonizing and saving the harmonized files in the ld_clump subdirectories for each protein. I've reproduced a script below that just harmonizes.
cd /Volumes/LaCie/tahir/mr_ukbppp_all_sbp/scripts
nohup Rscript mr_all_harmonization.R > mr_all_harmonization.log 2>&1 &
rm(list = ls())
# Load libraries
library(TwoSampleMR)
library(MendelianRandomization)
library(ieugwasr)
library(MRInstruments)
library(dplyr)
library(tidyr)
library(data.table)
library(openxlsx)
library(ggplot2)
library(scales)
library(RhpcBLASctl)
library(patchwork)
library(genetics.binaRies)
library(foreach)
library(doParallel)
library(UniProt.ws)
# Define %||% operator for fallback
`%||%` <- function(x, y) if (length(x) > 0) x else y
# Set memory limit (NOTE: on macOS, R_MAX_VSIZE is read at R startup,
# so to actually raise the limit it should be set in ~/.Renviron)
Sys.setenv("R_MAX_VSIZE" = 128000)
cat("Max vector size set to:", Sys.getenv("R_MAX_VSIZE"), "MB\n")
# Optimize BLAS for single-threaded performance
blas_set_num_threads(1)
# Set directories and files
base_dir <- "/Volumes/LaCie/tahir/mr_ukbppp_all_sbp"
data_dir <- file.path(base_dir, "data/fixed")
results_dir <- file.path(base_dir, "results_500KB_tss")
clump_dirs <- c("r2_0001", "r2_001", "r2_01")
# Create subdirectories for each clump value
for (dir in clump_dirs) {
  dir_path <- file.path(results_dir, dir)
  if (!dir.exists(dir_path)) dir.create(dir_path, recursive = TRUE)
}
# Check if data_dir exists
if (!dir.exists(data_dir)) {
  stop(sprintf("Data directory %s does not exist. Please verify the path.", data_dir))
}
# Outcome configuration for SBP
outcome_config <- list(
  name = "SBP",
  file = file.path(base_dir, "ieu-b-38_formatted.tsv"),
  id = "ieu-b-38"
)
# Verify outcome file exists
if (!file.exists(outcome_config$file)) {
  stop(sprintf("Outcome file %s does not exist. Please verify the file name and path.", outcome_config$file))
}
# Annotation file
local_annotation_file <- file.path(base_dir, "mart_clean_37_alias.txt")
if (!file.exists(local_annotation_file)) {
  stop(sprintf("Local annotation file %s not found", local_annotation_file))
}
# Load annotation file
gene_data <- fread(local_annotation_file, data.table = FALSE)
cat(sprintf("Loaded local annotation file with %d genes\n", nrow(gene_data)))
# List all protein files
protein_files <- list.files(data_dir, pattern = "^[A-Z0-9]+_[A-Z0-9]+_OID[0-9]+_v1_.*\\.gz$", full.names = TRUE)
cat("Total proteins to process:", length(protein_files), "\n")
# Function for harmonization
harmonize_protein <- function(protein_file, outcome_file, results_dir, clump_dirs, gene_data) {
  olink_id <- sub("_v1_.*$", "", basename(protein_file))
  cat(sprintf("Processing %s...\n", olink_id))
  # Get TSS and chromosome
  parts <- strsplit(olink_id, "_")[[1]]
  protein <- parts[1]
  protein_data <- gene_data %>%
    filter(tolower(`Gene name`) == tolower(protein) | tolower(`Gene Synonym`) == tolower(protein))
  if (nrow(protein_data) == 0) {
    cat(sprintf("No TSS data for %s in annotation file; skipping\n", olink_id))
    return(NULL)
  }
  protein_chrom <- as.character(protein_data$`Chromosome/scaffold name`[1])
  protein_tss <- as.numeric(protein_data$`Transcription start site (TSS)`[1])
  window_size <- 500000
  tss_start <- protein_tss - window_size
  tss_end <- protein_tss + window_size
  # Format exposure data
  exp_dat <- fread(protein_file) %>%
    filter(
      CHROM == protein_chrom,
      POS19 >= tss_start,
      POS19 <= tss_end,
      !is.na(rsid),
      grepl("^rs", rsid)
    ) %>%
    dplyr::select(
      SNP = rsid,
      chr.exposure = CHROM,
      pos.exposure = POS19,
      effect_allele.exposure = ALLELE1,
      other_allele.exposure = ALLELE0,
      eaf.exposure = A1FREQ,
      beta.exposure = BETA,
      se.exposure = SE,
      pval.exposure = P,
      samplesize.exposure = N
    ) %>%
    mutate(
      exposure = olink_id,
      id.exposure = olink_id,
      chr.exposure = as.character(chr.exposure)
    )
  # Format outcome data
  out_dat <- fread(outcome_file) %>%
    filter(
      CHR == protein_chrom,
      BP >= tss_start,
      BP <= tss_end,
      !is.na(SNP),
      grepl("^rs", SNP)
    ) %>%
    dplyr::select(
      SNP = SNP,
      chr.outcome = CHR,
      pos.outcome = BP,
      effect_allele.outcome = ALT,
      other_allele.outcome = REF,
      eaf.outcome = EAF,
      beta.outcome = BETA,
      se.outcome = SE,
      pval.outcome = PVAL
    ) %>%
    mutate(
      outcome = outcome_config$name,
      id.outcome = outcome_config$id,
      chr.outcome = as.character(chr.outcome),
      samplesize.outcome = 757601
    )
  # Harmonization step
  dat <- tryCatch({
    harmonise_data(exposure_dat = exp_dat, outcome_dat = out_dat, action = 1)
  }, error = function(e) {
    cat(sprintf("Harmonization failed for %s: %s\n", olink_id, e$message))
    return(NULL)
  })
  if (is.null(dat) || nrow(dat) == 0) {
    cat(sprintf("No harmonized data for %s; skipping\n", olink_id))
    return(NULL)
  }
  # Save harmonized data to each LD clump directory
  for (dir in clump_dirs) {
    clump_dir <- file.path(results_dir, dir, olink_id)
    if (!dir.exists(clump_dir)) dir.create(clump_dir, recursive = TRUE)
    harmonized_file <- file.path(clump_dir, sprintf("tss_harmonized_%s_%s.csv", olink_id, outcome_config$name))
    write.csv(dat, harmonized_file, row.names = FALSE)
    cat(sprintf("Harmonized data saved for %s in %s: %s\n", olink_id, dir, harmonized_file))
  }
}
# Parallel setup
num_cores <- min(detectCores() - 1, 16)
registerDoParallel(cores = num_cores)
cat(sprintf("Using %d cores for parallel processing\n", num_cores))
# Apply harmonization on all protein files in parallel
foreach(file = protein_files, .packages = c("dplyr", "data.table", "TwoSampleMR")) %dopar% {
  harmonize_protein(file, outcome_config$file, results_dir, clump_dirs, gene_data)
}
cat("Harmonization process completed for all proteins in results_500KB_tss.\n")
rm(list = ls())
# Load libraries
library(TwoSampleMR)
library(MendelianRandomization)
library(ieugwasr)
library(MRInstruments)
library(dplyr)
library(tidyr)
library(data.table)
library(openxlsx)
library(ggplot2)
library(scales)
library(RhpcBLASctl)
library(patchwork)
library(genetics.binaRies)
# Define %||% operator for fallback
`%||%` <- function(x, y) if (length(x) > 0) x else y
# Set memory limit
Sys.setenv("R_MAX_VSIZE" = 128000)
cat("Max vector size set to:", Sys.getenv("R_MAX_VSIZE"), "MB\n")
# Set directories
base_dir <- "/Volumes/LaCie/tahir/mr_ukbppp_all_sbp"
results_dir <- file.path(base_dir, "results_500KB_tss")
if (!dir.exists(results_dir)) stop("Results directory does not exist:", results_dir)
# LD clumping thresholds and corresponding directories
clump_r2_values <- c(0.1, 0.01, 0.001)
clump_labels <- c("r2_01", "r2_001", "r2_0001")
# Outcome configuration
outcome_config <- list(name = "SBP", id = "ieu-b-38")
# Set up logging
log_file <- file.path(base_dir, "mr_analysis_full.log")
sink(log_file, append = TRUE, split = TRUE)
cat(sprintf("Starting full MR analysis on %s\n", Sys.time()))
# Function to perform MR analysis
perform_mr <- function(harmonized_file, protein_dir, olink_id, clump_r2, clump_label) {
  today <- Sys.Date()
  clumped_file <- file.path(protein_dir, sprintf("clumped_%s_%s_r2%s_%s.csv", olink_id, outcome_config$name, clump_label, today))
  excel_file <- file.path(protein_dir, sprintf("MR_Results_%s_%s_r2%s_%s.xlsx", olink_id, outcome_config$name, clump_label, today))
  plot_file <- file.path(protein_dir, sprintf("MR_Forestplot_%s_%s_r2%s_%s.png", olink_id, outcome_config$name, clump_label, today))
  # Check for existing files
  if (file.exists(clumped_file) || file.exists(excel_file) || file.exists(plot_file)) {
    cat(sprintf("Skipping %s (r2 = %s): Existing files found\n", olink_id, clump_r2))
    return(NULL)
  }
  # Read harmonized data
  cat(sprintf("Reading harmonized file: %s\n", harmonized_file))
  dat <- fread(harmonized_file)
  if (nrow(dat) == 0) {
    cat(sprintf("No data in harmonized file for %s; skipping\n", olink_id))
    return(NULL)
  }
  # Filter instruments
  instruments <- dat %>%
    filter(mr_keep == TRUE, pval.exposure < 5e-6) %>%
    mutate(
      rsid = SNP,
      pval = pval.exposure,
      id = sprintf("%s exposure", olink_id),
      F_stat = (beta.exposure / se.exposure)^2
    ) %>%
    filter(F_stat > 10)
  cat(sprintf("Number of instruments for %s: %d\n", olink_id, nrow(instruments)))
  if (nrow(instruments) == 0) {
    cat(sprintf("No valid instruments for %s; skipping\n", olink_id))
    return(NULL)
  }
  # Steiger filtering
  instruments <- steiger_filtering(instruments)
  if (!"steiger_dir" %in% colnames(instruments) || nrow(instruments) == 0) {
    cat(sprintf("Steiger filtering failed for %s; skipping\n", olink_id))
    return(NULL)
  }
  instruments <- instruments %>% filter(steiger_dir == TRUE)
  cat(sprintf("Instruments after Steiger filtering for %s: %d\n", olink_id, nrow(instruments)))
  if (nrow(instruments) == 0) return(NULL)
  # LD clumping
  clumped_dat <- NULL
  tryCatch({
    clumped <- ld_clump(
      dplyr::tibble(rsid = instruments$rsid, pval = instruments$pval, id = instruments$id),
      plink_bin = genetics.binaRies::get_plink_binary(),
      bfile = "/Users/charleenadams/1000G_bfiles/EUR/EUR",
      clump_r2 = clump_r2,
      clump_kb = 500
    )
    clumped_dat <- instruments %>% filter(SNP %in% clumped$rsid)
    cat(sprintf("SNPs after clumping for %s (r2 = %s): %d\n", olink_id, clump_r2, nrow(clumped_dat)))
    write.csv(clumped_dat, clumped_file, row.names = FALSE)
  }, error = function(e) {
    cat(sprintf("LD clumping failed for %s (r2 = %s): %s\n", olink_id, clump_r2, e$message))
    return(NULL)
  })
  if (is.null(clumped_dat) || nrow(clumped_dat) == 0) {
    cat(sprintf("No SNPs after clumping for %s (r2 = %s); skipping\n", olink_id, clump_r2))
    return(NULL)
  }
  # Perform MR analyses
  ivw_result <- mr(clumped_dat, method_list = "mr_ivw")
  egger_result <- if (nrow(clumped_dat) >= 3) {
    tryCatch(mr(clumped_dat, method_list = "mr_egger_regression"), error = function(e) NULL)
  } else NULL
  weighted_median_result <- mr(clumped_dat, method_list = "mr_weighted_median")
  mr_input <- mr_input(
    bx = clumped_dat$beta.exposure,
    bxse = clumped_dat$se.exposure,
    by = clumped_dat$beta.outcome,
    byse = clumped_dat$se.outcome,
    exposure = olink_id,
    outcome = outcome_config$name,
    snps = clumped_dat$SNP
  )
  lasso_result <- tryCatch(mr_lasso(mr_input), error = function(e) {
    cat(sprintf("MR-Lasso failed for %s: %s\n", olink_id, e$message)); NULL
  })
  conmix_result <- tryCatch(mr_conmix(mr_input), error = function(e) {
    cat(sprintf("MR-ConMix failed for %s: %s\n", olink_id, e$message)); NULL
  })
  heterogeneity_result <- mr_heterogeneity(clumped_dat)
  pleiotropy_result <- if (nrow(clumped_dat) >= 3) mr_pleiotropy_test(clumped_dat) else NULL
  loo_result <- mr_leaveoneout(clumped_dat)
  # Per-SNP Wald ratios; the SE uses the standard delta-method approximation
  wald_ratios <- clumped_dat %>%
    mutate(
      wald_beta = beta.outcome / beta.exposure,
      wald_se = sqrt((se.outcome^2 / beta.exposure^2) + ((beta.outcome^2 * se.exposure^2) / (beta.exposure^4))),
      pval = 2 * pnorm(abs(wald_beta / wald_se), lower.tail = FALSE),
      method = paste("Wald Ratio:", SNP)
    ) %>%
    dplyr::select(SNP, wald_beta, wald_se, pval, method)
  # Process MR results for forest plot
  mr_results <- bind_rows(
    if (nrow(ivw_result) > 0) ivw_result[, c("method", "b", "se", "pval")],
    if (!is.null(egger_result) && nrow(egger_result) > 0) egger_result[, c("method", "b", "se", "pval")],
    if (nrow(weighted_median_result) > 0) weighted_median_result[, c("method", "b", "se", "pval")],
    if (!is.null(lasso_result)) data.frame(
      method = "MR-Lasso",
      b = lasso_result@Estimate,
      se = lasso_result@StdError,
      pval = lasso_result@Pvalue
    ),
    if (!is.null(conmix_result)) data.frame(
      method = "MR-ConMix",
      b = conmix_result@Estimate,
      se = (conmix_result@CIUpper - conmix_result@CILower) / (2 * 1.96),
      pval = conmix_result@Pvalue
    ),
    wald_ratios %>% dplyr::select(method, b = wald_beta, se = wald_se, pval)
  ) %>%
    mutate(
      method = factor(method, levels = unique(method)),
      or = exp(b),
      or_lower = exp(b - 1.96 * se),
      or_upper = exp(b + 1.96 * se)
    ) %>%
    na.omit()
  # Define colors for forest plot (keys must match the method labels TwoSampleMR emits)
  base_colors <- c("Inverse variance weighted" = "#1F78B4", "MR Egger" = "#FF7F00",
                   "Weighted median" = "#33A02C",
                   "MR-Lasso" = "#FB9A99", "MR-ConMix" = "#E41A1C")
  wald_methods <- unique(mr_results$method[grepl("Wald Ratio", mr_results$method)])
  wald_colors <- if (length(wald_methods) > 0) setNames(hue_pal()(length(wald_methods)), wald_methods) else NULL
  color_list <- c(base_colors, wald_colors)
  color_list <- color_list[names(color_list) %in% unique(mr_results$method)]
  # Create forest plot
  x_min <- min(mr_results$or_lower, na.rm = TRUE)
  x_max <- max(mr_results$or_upper, na.rm = TRUE)
  protein <- strsplit(olink_id, "_")[[1]][1]
  forest_plot <- ggplot(mr_results, aes(x = or, y = method, color = method)) +
    geom_errorbarh(aes(xmin = or_lower, xmax = or_upper), height = 0.2, na.rm = TRUE, linewidth = 0.8) +
    geom_point(size = 3) +
    geom_vline(xintercept = 1, linetype = "dashed", color = "grey50") +
    labs(
      title = sprintf("Mendelian Randomization Estimates:\n %s on %s (r2 = %s)", protein, outcome_config$name, clump_r2),
      x = "Causal Effect (OR)",  # the x aesthetic is exp(b), so label it on the OR scale
      y = ""
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
      axis.text.y = element_text(size = 12, face = "bold"),
      axis.text.x = element_text(size = 10),
      axis.title.x = element_text(size = 12),
      panel.grid.major = element_line(color = "grey90"),
      panel.grid.minor = element_blank(),
      legend.position = "none",
      panel.background = element_rect(fill = "white", color = NA),
      plot.background = element_rect(fill = "white", color = NA)
    ) +
    scale_color_manual(values = color_list) +
    xlim(x_min - 0.02 * (x_max - x_min), x_max + 0.02 * (x_max - x_min))
  # Save outputs to Excel
  wb <- createWorkbook()
  title_style <- createStyle(fontSize = 14, fontColour = "black", textDecoration = "bold", halign = "center")
  header_style <- createStyle(fontColour = "white", fgFill = "#C8A2C8", textDecoration = "bold", halign = "center", border = "TopBottomLeftRight")
  add_formatted_sheet <- function(wb, sheet_name, data, title) {
    if (is.null(data) || !is.data.frame(data) || nrow(data) == 0 || ncol(data) == 0) {
      addWorksheet(wb, sheet_name)
      writeData(wb, sheet_name, "No data available")
      return()
    }
    addWorksheet(wb, sheet_name)
    writeData(wb, sheet_name, title, startRow = 1, startCol = 1)
    writeData(wb, sheet_name, data, startRow = 3, startCol = 1, headerStyle = header_style)
    mergeCells(wb, sheet_name, cols = 1:ncol(data), rows = 1)
    setColWidths(wb, sheet_name, cols = 1:ncol(data), widths = "auto")
    addStyle(wb, sheet_name, title_style, rows = 1, cols = 1)
  }
  toc_data <- data.frame(
    Sheet = c("IVW", "MR-Egger", "Weighted_Median", "MR-Lasso", "MR-ConMix",
              "Wald_Ratios", "Heterogeneity_Test", "Pleiotropy_Test", "Leave_One_Out",
              "Instruments", "Clumped_Data"),
    Title = c("Inverse Variance Weighted Results", "MR-Egger Results",
              "Weighted Median Results", "MR-Lasso Results", "MR-ConMix Results",
              "Wald Ratio Tests", "Heterogeneity Test Results",
              "Pleiotropy Test Results", "Leave-One-Out Analysis",
              "Filtered Instruments", "Clumped SNPs Data")
  )
  addWorksheet(wb, "TOC")
  writeData(wb, "TOC", "Table of Contents", startRow = 1, startCol = 1)
  mergeCells(wb, "TOC", cols = 1:2, rows = 1)
  addStyle(wb, "TOC", title_style, rows = 1, cols = 1)
  writeData(wb, "TOC", toc_data, startRow = 3, startCol = 1, headerStyle = header_style)
  setColWidths(wb, "TOC", cols = 1:2, widths = "auto")
  add_formatted_sheet(wb, "IVW", ivw_result, "Inverse Variance Weighted Results")
  add_formatted_sheet(wb, "MR-Egger", egger_result, "MR-Egger Results")
  add_formatted_sheet(wb, "Weighted_Median", weighted_median_result, "Weighted Median Results")
  if (!is.null(lasso_result)) {
    lasso_df <- data.frame(
      Exposure = lasso_result@Exposure,
      Outcome = lasso_result@Outcome,
      Estimate = lasso_result@Estimate,
      StdError = lasso_result@StdError,
      CILower = lasso_result@CILower,
      CIUpper = lasso_result@CIUpper,
      Pvalue = lasso_result@Pvalue,
      SNPs = lasso_result@SNPs,
      Valid = lasso_result@Valid,
      ValidSNPs = paste(lasso_result@ValidSNPs, collapse = ", ")
    )
    add_formatted_sheet(wb, "MR-Lasso", lasso_df, "MR-Lasso Results")
  }
  if (!is.null(conmix_result)) {
    conmix_df <- data.frame(
      Exposure = conmix_result@Exposure,
      Outcome = conmix_result@Outcome,
      Estimate = conmix_result@Estimate,
      Pvalue = conmix_result@Pvalue,
      CILower = conmix_result@CILower,
      CIUpper = conmix_result@CIUpper,
      SNPs = conmix_result@SNPs
    )
    add_formatted_sheet(wb, "MR-ConMix", conmix_df, "MR-ConMix Results")
  }
  add_formatted_sheet(wb, "Wald_Ratios", wald_ratios, "Wald Ratio Tests")
  add_formatted_sheet(wb, "Heterogeneity_Test", heterogeneity_result, "Heterogeneity Test Results")
  add_formatted_sheet(wb, "Pleiotropy_Test", pleiotropy_result, "Pleiotropy Test Results")
  add_formatted_sheet(wb, "Leave_One_Out", loo_result, "Leave-One-Out Analysis")
  add_formatted_sheet(wb, "Instruments", instruments, "Filtered Instruments")
  add_formatted_sheet(wb, "Clumped_Data", clumped_dat, "Clumped SNPs Data")
  saveWorkbook(wb, excel_file, overwrite = FALSE)
  ggsave(plot_file, forest_plot, width = 12, height = 10, dpi = 600, bg = "white")
  cat(sprintf("Results for %s (r2 = %s) saved to: %s and %s\n", olink_id, clump_r2, excel_file, plot_file))
  return(list(
    ivw = ivw_result, egger = egger_result, weighted_median = weighted_median_result,
    lasso = lasso_result, conmix = conmix_result, wald_ratios = wald_ratios,
    heterogeneity = heterogeneity_result, pleiotropy = pleiotropy_result,
    loo = loo_result, instruments = instruments, clumped = clumped_dat,
    protein_id = olink_id, r2_value = clump_label
  ))
}
# Main loop for all proteins
master_data <- list()
for (i in seq_along(clump_r2_values)) {
  clump_r2 <- clump_r2_values[i]
  clump_label <- clump_labels[i]
  clump_dir <- file.path(results_dir, clump_label)
  cat(sprintf("Checking directory: %s\n", clump_dir))
  if (!dir.exists(clump_dir)) {
    cat(sprintf("Directory %s does not exist; skipping\n", clump_dir))
    next
  }
  protein_dirs <- list.dirs(clump_dir, recursive = FALSE, full.names = FALSE)
  cat(sprintf("Protein directories in %s:\n", clump_dir))
  print(protein_dirs)
  for (protein in protein_dirs) {
    protein_dir <- file.path(clump_dir, protein)
    cat(sprintf("Checking protein directory: %s\n", protein_dir))
    if (!dir.exists(protein_dir)) {
      cat(sprintf("Protein directory %s does not exist; skipping\n", protein_dir))
      next
    }
    # List all files in protein directory for debugging
    all_files <- list.files(protein_dir, full.names = TRUE)
    cat(sprintf("Files in %s:\n", protein_dir))
    print(all_files)
    # Search for harmonized file with a flexible pattern
    harmonized_file <- list.files(protein_dir, pattern = sprintf("tss_harmonized_%s_%s.*\\.csv$", protein, outcome_config$name), full.names = TRUE)
    cat(sprintf("Searching for harmonized file with pattern: tss_harmonized_%s_%s.*\\.csv\n", protein, outcome_config$name))
    cat(sprintf("Found harmonized files:\n"))
    print(harmonized_file)
    if (length(harmonized_file) == 0) {
      cat(sprintf("No harmonized file found for %s in %s; skipping\n", protein, clump_label))
      next
    }
    result <- perform_mr(harmonized_file[1], protein_dir, protein, clump_r2, clump_label)
    if (!is.null(result)) master_data[[length(master_data) + 1]] <- result
  }
}
# Create master file
today <- Sys.Date()
master_file <- file.path(base_dir, sprintf("Master_MR_Results_%s_%s.xlsx", outcome_config$name, today))
wb <- createWorkbook()
title_style <- createStyle(fontSize = 14, fontColour = "black", textDecoration = "bold", halign = "center")
header_style <- createStyle(fontColour = "white", fgFill = "#C8A2C8", textDecoration = "bold", halign = "center", border = "TopBottomLeftRight")
tabs <- c("IVW", "MR-Egger", "Weighted_Median", "MR-Lasso", "MR-ConMix",
"Wald_Ratios", "Heterogeneity_Test", "Pleiotropy_Test", "Leave_One_Out",
"Instruments", "Clumped_Data")
master_data_combined <- setNames(vector("list", length(tabs)), tabs)
# The result-list slots use short lowercase names that don't match the tab labels
# (e.g. "MR-Egger" vs "egger"), so map them explicitly instead of tolower()-ing the tab
tab_to_slot <- c(
  "IVW" = "ivw", "MR-Egger" = "egger", "Weighted_Median" = "weighted_median",
  "MR-Lasso" = "lasso", "MR-ConMix" = "conmix", "Wald_Ratios" = "wald_ratios",
  "Heterogeneity_Test" = "heterogeneity", "Pleiotropy_Test" = "pleiotropy",
  "Leave_One_Out" = "loo", "Instruments" = "instruments", "Clumped_Data" = "clumped"
)
for (result in master_data) {
  protein_id <- result$protein_id
  r2_value <- result$r2_value
  for (tab in tabs) {
    data <- result[[tab_to_slot[[tab]]]]
    if (is.null(data)) next
    # Flatten the S4 MR-Lasso/MR-ConMix objects into data frames
    if (tab %in% c("MR-Lasso", "MR-ConMix") && !is.data.frame(data)) {
      data <- if (tab == "MR-Lasso") {
        data.frame(
          Exposure = data@Exposure, Outcome = data@Outcome, Estimate = data@Estimate,
          StdError = data@StdError, CILower = data@CILower, CIUpper = data@CIUpper,
          Pvalue = data@Pvalue, SNPs = data@SNPs, Valid = data@Valid,
          ValidSNPs = paste(data@ValidSNPs, collapse = ", ")
        )
      } else {
        data.frame(
          Exposure = data@Exposure, Outcome = data@Outcome, Estimate = data@Estimate,
          Pvalue = data@Pvalue, CILower = data@CILower, CIUpper = data@CIUpper,
          SNPs = data@SNPs
        )
      }
    }
    if (!is.data.frame(data) || nrow(data) == 0) next
    data$Protein_ID <- protein_id
    data$Clump_R2 <- r2_value
    master_data_combined[[tab]] <- bind_rows(master_data_combined[[tab]], data)
  }
}
add_formatted_sheet <- function(wb, sheet_name, data, title) {
  if (is.null(data) || nrow(data) == 0) {
    addWorksheet(wb, sheet_name)
    writeData(wb, sheet_name, "No data available")
    return()
  }
  addWorksheet(wb, sheet_name)
  writeData(wb, sheet_name, title, startRow = 1, startCol = 1)
  writeData(wb, sheet_name, data, startRow = 3, startCol = 1, headerStyle = header_style)
  mergeCells(wb, sheet_name, cols = 1:ncol(data), rows = 1)
  setColWidths(wb, sheet_name, cols = 1:ncol(data), widths = "auto")
  addStyle(wb, sheet_name, title_style, rows = 1, cols = 1)
}
toc_data <- data.frame(
Sheet = tabs,
Title = c("Inverse Variance Weighted Results", "MR-Egger Results",
"Weighted Median Results", "MR-Lasso Results", "MR-ConMix Results",
"Wald Ratio Tests", "Heterogeneity Test Results",
"Pleiotropy Test Results", "Leave-One-Out Analysis",
"Filtered Instruments", "Clumped SNPs Data")
)
addWorksheet(wb, "TOC")
writeData(wb, "TOC", "Table of Contents", startRow = 1, startCol = 1)
mergeCells(wb, "TOC", cols = 1:2, rows = 1)
addStyle(wb, "TOC", title_style, rows = 1, cols = 1)
writeData(wb, "TOC", toc_data, startRow = 3, startCol = 1, headerStyle = header_style)
setColWidths(wb, "TOC", cols = 1:2, widths = "auto")
for (tab in tabs) {
  add_formatted_sheet(wb, tab, master_data_combined[[tab]], toc_data$Title[tabs == tab])
}
saveWorkbook(wb, master_file, overwrite = TRUE)
cat(sprintf("Master file saved to: %s\n", master_file))
cat(sprintf("Full MR analysis completed on %s\n", Sys.time()))
sink()
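Launched in the background like the other steps; the script filename here is a placeholder for whatever this file is saved as:
cd /Volumes/LaCie/tahir/mr_ukbppp_all_sbp/scripts
nohup Rscript mr_all_analysis.R > mr_all_analysis.log 2>&1 &
tail -f /Volumes/LaCie/tahir/mr_ukbppp_all_sbp/mr_analysis_full.log   # watch progress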
# Standalone script to create master file from MR_Results_*.xlsx files
rm(list = ls())
# Load libraries
library(openxlsx)
library(dplyr)
library(data.table)
library(parallel)
library(doParallel)
library(foreach)
library(R.utils)
# Set directories
base_dir <- "/Volumes/LaCie/tahir/mr_ukbppp_all_sbp"
results_dir <- file.path(base_dir, "results_500KB_tss", "parallel")
log_file <- file.path(base_dir, "mr_master_file_creation.log")
# Set up logging
sink(log_file, append = TRUE, split = TRUE)
cat(sprintf("Starting master file creation on %s\n", Sys.time()))
# Validate results directory
if (!dir.exists(results_dir)) {
  cat("Error: Results directory does not exist:", results_dir, "\n")
  sink()
  stop("Results directory does not exist")
}
# Define tabs
tabs <- c("IVW", "MR-Egger", "Weighted_Median", "MR-Lasso", "MR-ConMix",
"Wald_Ratios", "Heterogeneity_Test", "Pleiotropy_Test", "Leave_One_Out",
"Instruments", "Clumped_Data")
# Initialize master_data_combined as data.tables
master_data_combined <- setNames(lapply(tabs, function(x) data.table()), tabs)
# Find all MR_Results_*.xlsx files
cat("Searching for MR_Results_*.xlsx files in", results_dir, "\n")
result_files <- list.files(
path = results_dir,
pattern = "MR_Results_.*\\.xlsx$",
recursive = TRUE,
full.names = TRUE
)
cat("Found", length(result_files), "results files\n")
# Function to process a single file
process_file <- function(file) {
  local_data <- setNames(lapply(tabs, function(x) data.table()), tabs)
  path_parts <- strsplit(dirname(file), .Platform$file.sep)[[1]]
  protein_id <- path_parts[length(path_parts)]
  r2_value <- path_parts[length(path_parts) - 1]
  cat(sprintf("Processing file: %s (protein: %s, r2: %s)\n", basename(file), protein_id, r2_value))
  tryCatch({
    for (tab in tabs) {
      data <- tryCatch({
        withTimeout({
          openxlsx::read.xlsx(file, sheet = tab, startRow = 3, detectDates = TRUE)
        }, timeout = 10)
      }, TimeoutException = function(e) {
        cat(sprintf("Timeout reading sheet %s from %s\n", tab, basename(file)))
        NULL
      }, error = function(e) {
        cat(sprintf("Error reading sheet %s from %s: %s\n", tab, basename(file), e$message))
        NULL
      })
      if (is.null(data) || nrow(data) == 0) {
        cat(sprintf("Skipping tab %s for %s: No data\n", tab, protein_id))
        next
      }
      if (!is.data.frame(data)) {
        cat(sprintf("Warning: Sheet %s in %s is not a data frame. Skipping.\n", tab, basename(file)))
        next
      }
      data <- as.data.table(data)
      data$Protein_ID <- protein_id
      data$Clump_R2 <- r2_value
      local_data[[tab]] <- data
    }
    local_data
  }, error = function(e) {
    cat(sprintf("Error processing file %s: %s\n", basename(file), e$message))
    NULL
  })
}
# Process files (parallel with fallback to sequential)
cat("Attempting parallel processing with 4 cores\n")
num_cores <- min(4, detectCores())
cl <- tryCatch({
  makeCluster(num_cores)
}, error = function(e) {
  cat("Error setting up parallel cluster: ", e$message, "\n")
  NULL
})
if (!is.null(cl)) {
  registerDoParallel(cl)
  clusterEvalQ(cl, {
    library(openxlsx)
    library(dplyr)
    library(data.table)
    library(R.utils)
  })
  batch_size <- ceiling(length(result_files) / num_cores)
  batches <- split(result_files, ceiling(seq_along(result_files) / batch_size))
  cat(sprintf("Processing %d files in %d batches\n", length(result_files), length(batches)))
  results <- foreach(batch = batches, .packages = c("openxlsx", "dplyr", "data.table", "R.utils")) %dopar% {
    batch_results <- lapply(batch, process_file)
    batch_results
  }
  results <- unlist(results, recursive = FALSE)
  stopCluster(cl)
  cat("Parallel cluster stopped\n")
} else {
  cat("Falling back to sequential processing\n")
  results <- lapply(result_files, process_file)
}
# Merge results
cat("Merging results\n")
for (result in results) {
  if (is.null(result)) next
  for (tab in tabs) {
    if (!is.null(result[[tab]]) && is.data.table(result[[tab]]) && nrow(result[[tab]]) > 0) {
      tryCatch({
        # Validate column names
        if (nrow(master_data_combined[[tab]]) > 0) {
          if (!all(names(result[[tab]]) %in% names(master_data_combined[[tab]]))) {
            cat(sprintf("Warning: Column mismatch for tab %s. Adjusting columns.\n", tab))
            result[[tab]] <- result[[tab]][, names(master_data_combined[[tab]]), with = FALSE]
          }
        }
        master_data_combined[[tab]] <- rbindlist(list(master_data_combined[[tab]], result[[tab]]), use.names = TRUE, fill = TRUE)
        cat(sprintf("Merged %d rows for tab %s\n", nrow(result[[tab]]), tab))
      }, error = function(e) {
        cat(sprintf("Error in rbindlist for tab %s: %s\n", tab, e$message))
      })
    }
  }
  gc() # Clean up memory after each result
}
# Create workbook
wb <- createWorkbook()
title_style <- createStyle(fontSize = 14, fontColour = "black", textDecoration = "bold", halign = "center")
header_style <- createStyle(fontColour = "white", fgFill = "#C8A2C8", textDecoration = "bold", halign = "center", border = "TopBottomLeftRight")
# Function to add formatted sheet
add_formatted_sheet <- function(wb, sheet_name, data, title) {
  if (is.null(data) || nrow(data) == 0) {
    addWorksheet(wb, sheet_name)
    writeData(wb, sheet_name, "No data available")
    cat(sprintf("Added empty sheet: %s\n", sheet_name))
    return()
  }
  tryCatch({
    addWorksheet(wb, sheet_name)
    writeData(wb, sheet_name, title, startRow = 1, startCol = 1)
    writeData(wb, sheet_name, as.data.frame(data), startRow = 3, startCol = 1, headerStyle = header_style)
    mergeCells(wb, sheet_name, cols = 1:ncol(data), rows = 1)
    setColWidths(wb, sheet_name, cols = 1:ncol(data), widths = "auto")
    addStyle(wb, sheet_name, title_style, rows = 1, cols = 1)
    cat(sprintf("Added sheet: %s with %d rows\n", sheet_name, nrow(data)))
  }, error = function(e) {
    cat(sprintf("Error adding sheet %s: %s\n", sheet_name, e$message))
  })
}
# Create TOC and other sheets
tryCatch({
  toc_data <- data.frame(
    Sheet = tabs,
    Title = c("Inverse Variance Weighted Results", "MR-Egger Results",
              "Weighted Median Results", "MR-Lasso Results", "MR-ConMix Results",
              "Wald Ratio Tests", "Heterogeneity Test Results",
              "Pleiotropy Test Results", "Leave-One-Out Analysis",
              "Filtered Instruments", "Clumped SNPs Data")
  )
  addWorksheet(wb, "TOC")
  writeData(wb, "TOC", "Table of Contents", startRow = 1, startCol = 1)
  mergeCells(wb, "TOC", cols = 1:2, rows = 1)
  addStyle(wb, "TOC", title_style, rows = 1, cols = 1)
  writeData(wb, "TOC", toc_data, startRow = 3, startCol = 1, headerStyle = header_style)
  setColWidths(wb, "TOC", cols = 1:2, widths = "auto")
  cat("Added TOC sheet\n")
  for (tab in tabs) {
    add_formatted_sheet(wb, tab, master_data_combined[[tab]], toc_data$Title[tabs == tab])
  }
}, error = function(e) {
  cat("Error creating TOC or sheets: ", e$message, "\n")
})
# Save workbook
today <- Sys.Date()
master_file <- file.path(results_dir, sprintf("Master_MR_Results_SBP_%s.xlsx", today))
tryCatch({
  saveWorkbook(wb, master_file, overwrite = TRUE)
  cat(sprintf("Master file saved to: %s\n", master_file))
}, error = function(e) {
  cat(sprintf("Error saving master file %s: %s\n", master_file, e$message))
})
cat(sprintf("Master file creation completed on %s\n", Sys.time()))
sink()
# Clear environment
rm(list = ls())
# Load libraries (minimal set needed for this task)
library(dplyr)
# Set directories
base_dir <- "/Volumes/LaCie/tahir/mr_ukbppp_all_sbp"
results_dir <- file.path(base_dir, "results_500KB_tss")
# Define the subdirectories to process
target_subdirs <- c("r2_01", "r2_001", "r2_0001")
# Find subdirectories, keeping only the target ones
subdirs <- list.dirs(results_dir, recursive = FALSE, full.names = TRUE)
subdirs <- subdirs[basename(subdirs) %in% target_subdirs]
# Check if target subdirectories exist
if (length(subdirs) == 0) {
  cat("None of the target subdirectories (r2_01, r2_001, r2_0001) were found. Exiting.\n")
  q()
}
# Find protein folders within target subdirectories
protein_folders <- unlist(lapply(subdirs, function(subdir) {
  folders <- list.dirs(subdir, recursive = FALSE, full.names = TRUE)
  # Filter folders to match the expected naming pattern (e.g., GENE_UniProtID_OID).
  # UniProt accessions are alphanumeric (e.g., Q6UY14), not letter + 5 digits,
  # so match [A-Z0-9]+ rather than [A-Z][0-9]{5}
  folders[grepl("^[A-Z0-9]+_[A-Z0-9]+_OID[0-9]+$", basename(folders))]
}))
# Remove NA or empty entries
protein_folders <- protein_folders[!is.na(protein_folders) & protein_folders != ""]
if (length(protein_folders) == 0) {
  cat("No valid protein folders found in subdirectories", paste(target_subdirs, collapse = ", "), ". Exiting.\n")
  q()
}
# Identify protein folders without harmonized files
# (length() > 0 rather than any(), since list.files() returns a character vector)
missing_proteins <- protein_folders[!sapply(protein_folders, function(dir) {
  length(list.files(dir, pattern = "tss_harmonized_.*\\.csv$", full.names = TRUE)) > 0
})]
# Remove NA or invalid entries
missing_proteins <- missing_proteins[!is.na(missing_proteins) & missing_proteins != ""]
# Summarize results
cat("Total protein folders in", paste(target_subdirs, collapse = ", "), ":", length(protein_folders), "\n")
cat("Protein folders without harmonized files:", length(missing_proteins), "\n")
if (length(missing_proteins) > 0) {
  cat("Sample of missing protein folders:", paste(head(basename(missing_proteins), 3), collapse = ", "), "\n")
  # Break down by subdirectory
  missing_by_subdir <- sapply(target_subdirs, function(subdir) {
    subdir_path <- file.path(results_dir, subdir)
    sum(grepl(subdir_path, missing_proteins, fixed = TRUE))
  })
  cat("\nBreakdown of missing protein folders by subdirectory:\n")
  for (i in seq_along(target_subdirs)) {
    cat(target_subdirs[i], ":", missing_by_subdir[i], "\n")
  }
} else {
  cat("All protein folders have harmonized files.\n")
}
rm(list = ls())
# Load libraries
library(TwoSampleMR)
library(MendelianRandomization)
library(ieugwasr)
library(MRInstruments)
library(dplyr)
library(tidyr)
library(data.table)
library(openxlsx)
library(ggplot2)
library(scales)
library(RhpcBLASctl)
library(patchwork)
library(genetics.binaRies)
library(foreach)
library(doParallel)
library(UniProt.ws)
# Define %||% operator for fallback
`%||%` <- function(x, y) if (length(x) > 0) x else y
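# e.g. character(0) %||% "default" returns "default"; "x" %||% "default" returns "x"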
# Set memory limit
# Note: R_MAX_VSIZE must be set before R starts (e.g., in ~/.Renviron) to take effect;
# setting it mid-session is recorded here for the log but does not raise the limit
Sys.setenv("R_MAX_VSIZE" = 128000)
cat("Max vector size requested:", Sys.getenv("R_MAX_VSIZE"), "MB\n")
# Optimize BLAS for single-threaded performance
blas_set_num_threads(1)
# Set directories and files
base_dir <- "/Volumes/LaCie/tahir/mr_ukbppp_all_sbp"
data_dir <- file.path(base_dir, "data/fixed")
results_dir <- file.path(base_dir, "results_250KB_tss")
clump_dirs <- c("r2_0001", "r2_001", "r2_01")
# Create subdirectories for each clump value
for (dir in clump_dirs) {
dir_path <- file.path(results_dir, dir)
if (!dir.exists(dir_path)) dir.create(dir_path, recursive = TRUE)
}
# Check if data_dir exists
if (!dir.exists(data_dir)) {
stop(sprintf("Data directory %s does not exist. Please verify the path.", data_dir))
}
# Outcome configuration for SBP
outcome_config <- list(
name = "SBP",
file = file.path(base_dir, "ieu-b-38_formatted.tsv"),
id = "ieu-b-38"
)
# Verify outcome file exists
if (!file.exists(outcome_config$file)) {
stop(sprintf("Outcome file %s does not exist. Please verify the file name and path.", outcome_config$file))
}
# Annotation file
local_annotation_file <- file.path(base_dir, "mart_clean_37_alias.txt")
if (!file.exists(local_annotation_file)) {
stop(sprintf("Local annotation file %s not found", local_annotation_file))
}
# Load annotation file
gene_data <- fread(local_annotation_file, data.table = FALSE)
cat(sprintf("Loaded local annotation file with %d genes\n", nrow(gene_data)))
# List all protein files
protein_files <- list.files(data_dir, pattern = "^[A-Z0-9]+_[A-Z0-9]+_OID[0-9]+_v1_.*\\.gz$", full.names = TRUE)
cat("Total proteins to process:", length(protein_files), "\n")
# Function for harmonization
harmonize_protein <- function(protein_file, outcome_file, results_dir, clump_dirs, gene_data) {
olink_id <- sub("_v1_.*$", "", basename(protein_file))
cat(sprintf("Processing %s...\n", olink_id))
# Get TSS and chromosome
parts <- strsplit(olink_id, "_")[[1]]
protein <- parts[1]
protein_data <- gene_data %>%
filter(tolower(`Gene name`) == tolower(protein) | tolower(`Gene Synonym`) == tolower(protein))
if (nrow(protein_data) == 0) {
cat(sprintf("No TSS data for %s in annotation file; skipping\n", olink_id))
return(NULL)
}
protein_chrom <- as.character(protein_data$`Chromosome/scaffold name`[1])
protein_tss <- as.numeric(protein_data$`Transcription start site (TSS)`[1])
window_size <- 250000
tss_start <- protein_tss - window_size
tss_end <- protein_tss + window_size
# Format exposure data
exp_dat <- fread(protein_file) %>%
filter(
CHROM == protein_chrom,
POS19 >= tss_start,
POS19 <= tss_end,
!is.na(rsid),
grepl("^rs", rsid)
) %>%
dplyr::select(
SNP = rsid,
chr.exposure = CHROM,
pos.exposure = POS19,
effect_allele.exposure = ALLELE1,
other_allele.exposure = ALLELE0,
eaf.exposure = A1FREQ,
beta.exposure = BETA,
se.exposure = SE,
pval.exposure = P,
samplesize.exposure = N
) %>%
mutate(
exposure = olink_id,
id.exposure = olink_id,
chr.exposure = as.character(chr.exposure)
)
# Format outcome data
out_dat <- fread(outcome_file) %>%
filter(
CHR == protein_chrom,
BP >= tss_start,
BP <= tss_end,
!is.na(SNP),
grepl("^rs", SNP)
) %>%
dplyr::select(
SNP = SNP,
chr.outcome = CHR,
pos.outcome = BP,
effect_allele.outcome = ALT,
other_allele.outcome = REF,
eaf.outcome = EAF,
beta.outcome = BETA,
se.outcome = SE,
pval.outcome = PVAL
) %>%
mutate(
outcome = outcome_config$name,
id.outcome = outcome_config$id,
chr.outcome = as.character(chr.outcome),
samplesize.outcome = 757601
)
# Harmonization step
dat <- tryCatch({
harmonise_data(exposure_dat = exp_dat, outcome_dat = out_dat, action = 1)
}, error = function(e) {
cat(sprintf("Harmonization failed for %s: %s\n", olink_id, e$message))
return(NULL)
})
if (is.null(dat) || nrow(dat) == 0) {
cat(sprintf("No harmonized data for %s; skipping\n", olink_id))
return(NULL)
}
# Save harmonized data to each LD clump directory
for (dir in clump_dirs) {
clump_dir <- file.path(results_dir, dir, olink_id)
if (!dir.exists(clump_dir)) dir.create(clump_dir, recursive = TRUE)
harmonized_file <- file.path(clump_dir, sprintf("tss_harmonized_%s_%s.csv", olink_id, outcome_config$name))
write.csv(dat, harmonized_file, row.names = FALSE)
cat(sprintf("Harmonized data saved for %s in %s: %s\n", olink_id, dir, harmonized_file))
}
}
# Parallel setup
num_cores <- min(detectCores() - 1, 16)
registerDoParallel(cores = num_cores)
cat(sprintf("Using %d cores for parallel processing\n", num_cores))
# Apply harmonization on all protein files in parallel
foreach(file = protein_files, .packages = c("dplyr", "data.table", "TwoSampleMR")) %dopar% {
harmonize_protein(file, outcome_config$file, results_dir, clump_dirs, gene_data)
}
cat("Harmonization process completed for all proteins in results_250KB_tss.\n")
Rscript mr_no_harm_parallel_250KB.R
library(TwoSampleMR)
library(MendelianRandomization)
library(ieugwasr)
library(MRInstruments)
library(dplyr)
library(tidyr)
library(data.table)
library(openxlsx)
library(ggplot2)
library(scales)
library(RhpcBLASctl)
library(patchwork)
library(genetics.binaRies)
library(parallel)
library(doParallel)
library(foreach)
# Define %||% operator for fallback
`%||%` <- function(x, y) if (length(x) > 0) x else y
# Set directories
base_dir <- "/Volumes/LaCie/tahir/mr_ukbppp_all_sbp"
input_dir <- file.path(base_dir, "results_250KB_tss") # Source data directory
results_dir <- file.path(base_dir, "results_250KB_tss", "parallel") # Output directory
if (!dir.exists(results_dir)) {
dir.create(results_dir, recursive = TRUE)
cat("Created results directory:", results_dir, "\n")
} else {
cat("Results directory exists:", results_dir, "\n")
}
# LD clumping thresholds and corresponding directories
clump_r2_values <- c(0.1, 0.01, 0.001)
clump_labels <- c("r2_01", "r2_001", "r2_0001")
# Create r2 subdirectories in output directory
for (clump_label in clump_labels) {
clump_dir <- file.path(results_dir, clump_label)
if (!dir.exists(clump_dir)) {
dir.create(clump_dir, recursive = TRUE)
cat("Created clump directory:", clump_dir, "\n")
}
}
# Outcome configuration
outcome_config <- list(name = "SBP", id = "ieu-b-38")
# Set up logging
log_file <- file.path(base_dir, "mr_analysis_full_parallel.log")
sink(log_file, append = TRUE, split = TRUE)
cat(sprintf("Starting parallel MR analysis on %s\n", Sys.time()))
# Function to perform MR analysis
perform_mr <- function(harmonized_file, protein_dir, olink_id, clump_r2, clump_label) {
today <- Sys.Date()
# Use results_dir for output (create protein_dir in parallel/r2_*/protein/)
output_protein_dir <- file.path(results_dir, clump_label, olink_id)
if (!dir.exists(output_protein_dir)) {
dir.create(output_protein_dir, recursive = TRUE)
}
# clump_label already carries the "r2_" prefix, so don't prepend "r2" again
clumped_file <- file.path(output_protein_dir, sprintf("clumped_%s_%s_%s_%s.csv", olink_id, outcome_config$name, clump_label, today))
excel_file <- file.path(output_protein_dir, sprintf("MR_Results_%s_%s_%s_%s.xlsx", olink_id, outcome_config$name, clump_label, today))
plot_file <- file.path(output_protein_dir, sprintf("MR_Forestplot_%s_%s_%s_%s.png", olink_id, outcome_config$name, clump_label, today))
if (file.exists(clumped_file) || file.exists(excel_file) || file.exists(plot_file)) {
cat(sprintf("Skipping %s (r2 = %s): Existing files found\n", olink_id, clump_r2))
return(NULL)
}
cat(sprintf("Reading harmonized file: %s\n", harmonized_file))
dat <- fread(harmonized_file)
if (nrow(dat) == 0) {
cat(sprintf("No data in harmonized file for %s; skipping\n", olink_id))
return(NULL)
}
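# Instrument selection: keep harmonised SNPs flagged mr_keep by TwoSampleMR,
# apply a cis-pQTL p-value threshold of 5e-6, and require per-SNP F > 10
# (approximated as (beta/se)^2) to limit weak-instrument bias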
instruments <- dat %>%
filter(mr_keep == TRUE, pval.exposure < 5e-6) %>%
mutate(
rsid = SNP,
pval = pval.exposure,
id = sprintf("%s exposure", olink_id),
F_stat = (beta.exposure / se.exposure)^2
) %>%
filter(F_stat > 10)
cat(sprintf("Number of instruments for %s: %d\n", olink_id, nrow(instruments)))
if (nrow(instruments) == 0) {
cat(sprintf("No valid instruments for %s; skipping\n", olink_id))
return(NULL)
}
instruments <- tryCatch(steiger_filtering(instruments), error = function(e) NULL)
if (is.null(instruments) || !"steiger_dir" %in% colnames(instruments) || nrow(instruments) == 0) {
cat(sprintf("Steiger filtering failed for %s; skipping\n", olink_id))
return(NULL)
}
instruments <- instruments %>% filter(steiger_dir == TRUE)
cat(sprintf("Instruments after Steiger filtering for %s: %d\n", olink_id, nrow(instruments)))
if (nrow(instruments) == 0) return(NULL)
clumped_dat <- NULL
tryCatch({
clumped <- ld_clump(
dplyr::tibble(rsid = instruments$rsid, pval = instruments$pval, id = instruments$id),
plink_bin = genetics.binaRies::get_plink_binary(),
bfile = "/Users/charleenadams/1000G_bfiles/EUR/EUR",
clump_r2 = clump_r2,
clump_kb = 500
)
clumped_dat <- instruments %>% filter(SNP %in% clumped$rsid)
cat(sprintf("SNPs after clumping for %s (r2 = %s): %d\n", olink_id, clump_r2, nrow(clumped_dat)))
write.csv(clumped_dat, clumped_file, row.names = FALSE)
}, error = function(e) {
cat(sprintf("LD clumping failed for %s (r2 = %s): %s\n", olink_id, clump_r2, e$message))
return(NULL)
})
if (is.null(clumped_dat) || nrow(clumped_dat) == 0) {
cat(sprintf("No SNPs after clumping for %s (r2 = %s); skipping\n", olink_id, clump_r2))
return(NULL)
}
ivw_result <- mr(clumped_dat, method_list = "mr_ivw")
egger_result <- if (nrow(clumped_dat) >= 3) {
tryCatch(mr(clumped_dat, method_list = "mr_egger_regression"), error = function(e) NULL)
} else NULL
weighted_median_result <- mr(clumped_dat, method_list = "mr_weighted_median")
mr_input <- mr_input(
bx = clumped_dat$beta.exposure,
bxse = clumped_dat$se.exposure,
by = clumped_dat$beta.outcome,
byse = clumped_dat$se.outcome,
exposure = olink_id,
outcome = outcome_config$name,
snps = clumped_dat$SNP
)
lasso_result <- tryCatch(mr_lasso(mr_input), error = function(e) {
cat(sprintf("MR-Lasso failed for %s: %s\n", olink_id, e$message)); NULL
})
conmix_result <- tryCatch(mr_conmix(mr_input), error = function(e) {
cat(sprintf("MR-ConMix failed for %s: %s\n", olink_id, e$message)); NULL
})
heterogeneity_result <- mr_heterogeneity(clumped_dat)
pleiotropy_result <- if (nrow(clumped_dat) >= 3) mr_pleiotropy_test(clumped_dat) else NULL
loo_result <- mr_leaveoneout(clumped_dat)
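# Per-SNP Wald ratio: wald_beta = beta.outcome / beta.exposure, with the
# first-order delta-method SE used in the mutate() below:
#   wald_se = sqrt(se.outcome^2 / beta.exposure^2 + beta.outcome^2 * se.exposure^2 / beta.exposure^4)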
wald_ratios <- clumped_dat %>%
mutate(
wald_beta = beta.outcome / beta.exposure,
wald_se = sqrt((se.outcome^2 / beta.exposure^2) + ((beta.outcome^2 * se.exposure^2) / (beta.exposure^4))),
pval = 2 * pnorm(abs(wald_beta / wald_se), lower.tail = FALSE),
method = paste("Wald Ratio:", SNP)
) %>%
dplyr::select(SNP, wald_beta, wald_se, pval, method)
mr_results <- bind_rows(
if (nrow(ivw_result) > 0) ivw_result[, c("method", "b", "se", "pval")],
if (!is.null(egger_result) && nrow(egger_result) > 0) egger_result[, c("method", "b", "se", "pval")],
if (nrow(weighted_median_result) > 0) weighted_median_result[, c("method", "b", "se", "pval")],
if (!is.null(lasso_result)) data.frame(
method = "MR-Lasso",
b = lasso_result@Estimate,
se = lasso_result@StdError,
pval = lasso_result@Pvalue
),
if (!is.null(conmix_result)) data.frame(
method = "MR-ConMix",
b = conmix_result@Estimate,
se = (conmix_result@CIUpper - conmix_result@CILower)/(2*1.96),
pval = conmix_result@Pvalue
),
wald_ratios %>% dplyr::select(method, b = wald_beta, se = wald_se, pval)
) %>%
mutate(
method = factor(method, levels = unique(method)),
or = exp(b),
or_lower = exp(b - 1.96 * se),
or_upper = exp(b + 1.96 * se)
) %>%
na.omit()
base_colors <- c("IVW" = "#1F78B4", "MR-Egger" = "#FF7F00", "Weighted Median" = "#33A02C",
"MR-Lasso" = "#FB9A99", "MR-ConMix" = "#E41A1C")
wald_methods <- unique(mr_results$method[grepl("Wald Ratio", mr_results$method)])
wald_colors <- if (length(wald_methods) > 0) setNames(hue_pal()(length(wald_methods)), wald_methods) else NULL
color_list <- c(base_colors, wald_colors)
color_list <- color_list[names(color_list) %in% unique(mr_results$method)]
x_min <- min(mr_results$or_lower, na.rm = TRUE)
x_max <- max(mr_results$or_upper, na.rm = TRUE)
protein <- strsplit(olink_id, "_")[[1]][1]
forest_plot <- ggplot(mr_results, aes(x = or, y = method, color = method)) +
geom_errorbarh(aes(xmin = or_lower, xmax = or_upper), height = 0.2, na.rm = TRUE, linewidth = 0.8) +
geom_point(size = 3) +
geom_vline(xintercept = 1, linetype = "dashed", color = "grey50") +
labs(
title = sprintf("Mendelian Randomization Estimates:\n %s on %s (r2 = %s)", protein, outcome_config$name, clump_r2),
x = "Causal Effect (Beta)",
y = ""
) +
theme_minimal() +
theme(
plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
axis.text.y = element_text(size = 12, face = "bold"),
axis.text.x = element_text(size = 10),
axis.title.x = element_text(size = 12),
panel.grid.major = element_line(color = "grey90"),
panel.grid.minor = element_blank(),
legend.position = "none",
panel.background = element_rect(fill = "white", color = NA),
plot.background = element_rect(fill = "white", color = NA)
) +
scale_color_manual(values = color_list) +
xlim(x_min - 0.02 * (x_max - x_min), x_max + 0.02 * (x_max - x_min))
wb <- createWorkbook()
title_style <- createStyle(fontSize = 14, fontColour = "black", textDecoration = "bold", halign = "center")
header_style <- createStyle(fontColour = "white", fgFill = "#C8A2C8", textDecoration = "bold", halign = "center", border = "TopBottomLeftRight")
add_formatted_sheet <- function(wb, sheet_name, data, title) {
if (is.null(data) || !is.data.frame(data) || nrow(data) == 0 || ncol(data) == 0) {
addWorksheet(wb, sheet_name)
writeData(wb, sheet_name, "No data available")
return()
}
addWorksheet(wb, sheet_name)
writeData(wb, sheet_name, title, startRow = 1, startCol = 1)
writeData(wb, sheet_name, data, startRow = 3, startCol = 1, headerStyle = header_style)
mergeCells(wb, sheet_name, cols = 1:ncol(data), rows = 1)
setColWidths(wb, sheet_name, cols = 1:ncol(data), widths = "auto")
addStyle(wb, sheet_name, title_style, rows = 1, cols = 1)
}
toc_data <- data.frame(
Sheet = c("IVW", "MR-Egger", "Weighted_Median", "MR-Lasso", "MR-ConMix",
"Wald_Ratios", "Heterogeneity_Test", "Pleiotropy_Test", "Leave_One_Out",
"Instruments", "Clumped_Data"),
Title = c("Inverse Variance Weighted Results", "MR-Egger Results",
"Weighted Median Results", "MR-Lasso Results", "MR-ConMix Results",
"Wald Ratio Tests", "Heterogeneity Test Results",
"Pleiotropy Test Results", "Leave-One-Out Analysis",
"Filtered Instruments", "Clumped SNPs Data")
)
addWorksheet(wb, "TOC")
writeData(wb, "TOC", "Table of Contents", startRow = 1, startCol = 1)
mergeCells(wb, "TOC", cols = 1:2, rows = 1)
addStyle(wb, "TOC", title_style, rows = 1, cols = 1)
writeData(wb, "TOC", toc_data, startRow = 3, startCol = 1, headerStyle = header_style)
setColWidths(wb, "TOC", cols = 1:2, widths = "auto")
add_formatted_sheet(wb, "IVW", ivw_result, "Inverse Variance Weighted Results")
add_formatted_sheet(wb, "MR-Egger", egger_result, "MR-Egger Results")
add_formatted_sheet(wb, "Weighted_Median", weighted_median_result, "Weighted Median Results")
if (!is.null(lasso_result)) {
lasso_df <- data.frame(
Exposure = lasso_result@Exposure,
Outcome = lasso_result@Outcome,
Estimate = lasso_result@Estimate,
StdError = lasso_result@StdError,
CILower = lasso_result@CILower,
CIUpper = lasso_result@CIUpper,
Pvalue = lasso_result@Pvalue,
SNPs = lasso_result@SNPs,
Valid = lasso_result@Valid,
ValidSNPs = paste(lasso_result@ValidSNPs, collapse = ", ")
)
add_formatted_sheet(wb, "MR-Lasso", lasso_df, "MR-Lasso Results")
}
if (!is.null(conmix_result)) {
conmix_df <- data.frame(
Exposure = conmix_result@Exposure,
Outcome = conmix_result@Outcome,
Estimate = conmix_result@Estimate,
Pvalue = conmix_result@Pvalue,
CILower = conmix_result@CILower,
CIUpper = conmix_result@CIUpper,
SNPs = conmix_result@SNPs
)
add_formatted_sheet(wb, "MR-ConMix", conmix_df, "MR-ConMix Results")
}
add_formatted_sheet(wb, "Wald_Ratios", wald_ratios, "Wald Ratio Tests")
add_formatted_sheet(wb, "Heterogeneity_Test", heterogeneity_result, "Heterogeneity Test Results")
add_formatted_sheet(wb, "Pleiotropy_Test", pleiotropy_result, "Pleiotropy Test Results")
add_formatted_sheet(wb, "Leave_One_Out", loo_result, "Leave-One-Out Analysis")
add_formatted_sheet(wb, "Instruments", instruments, "Filtered Instruments")
add_formatted_sheet(wb, "Clumped_Data", clumped_dat, "Clumped SNPs Data")
saveWorkbook(wb, excel_file, overwrite = FALSE)
ggsave(plot_file, forest_plot, width = 12, height = 10, dpi = 600, bg = "white")
cat(sprintf("Results for %s (r2 = %s) saved to: %s and %s\n", olink_id, clump_r2, excel_file, plot_file))
return(list(
ivw = ivw_result, egger = egger_result, weighted_median = weighted_median_result,
lasso = lasso_result, conmix = conmix_result, wald_ratios = wald_ratios,
heterogeneity = heterogeneity_result, pleiotropy = pleiotropy_result,
loo = loo_result, instruments = instruments, clumped = clumped_dat,
protein_id = olink_id, r2_value = clump_label
))
}
# Set up parallel processing
num_cores <- min(15, detectCores())
cat(sprintf("Setting up parallel processing with %d cores\n", num_cores))
cl <- makeCluster(num_cores)
registerDoParallel(cl)
# Export necessary packages and objects to cluster
clusterExport(cl, c("perform_mr", "outcome_config", "%||%", "results_dir"))
clusterEvalQ(cl, {
library(TwoSampleMR)
library(MendelianRandomization)
library(ieugwasr)
library(MRInstruments)
library(dplyr)
library(tidyr)
library(data.table)
library(openxlsx)
library(ggplot2)
library(scales)
library(RhpcBLASctl)
library(patchwork)
library(genetics.binaRies)
})
# Create task list (read from input_dir, write to results_dir)
tasks <- list()
for (i in seq_along(clump_r2_values)) {
clump_r2 <- clump_r2_values[i]
clump_label <- clump_labels[i]
clump_dir <- file.path(input_dir, clump_label) # Read from input_dir
cat(sprintf("Checking directory: %s\n", clump_dir))
if (!dir.exists(clump_dir)) {
cat(sprintf("Directory %s does not exist; skipping\n", clump_dir))
next
}
protein_dirs <- list.dirs(clump_dir, recursive = FALSE, full.names = FALSE)
cat(sprintf("Protein directories in %s:\n", clump_dir))
print(protein_dirs)
for (protein in protein_dirs) {
protein_dir <- file.path(clump_dir, protein)
cat(sprintf("Checking protein directory: %s\n", protein_dir))
if (!dir.exists(protein_dir)) {
cat(sprintf("Protein directory %s does not exist; skipping\n", protein_dir))
next
}
all_files <- list.files(protein_dir, full.names = TRUE)
cat(sprintf("Files in %s:\n", protein_dir))
print(all_files)
harmonized_file <- list.files(protein_dir, pattern = sprintf("tss_harmonized_%s_%s.*\\.csv$", protein, outcome_config$name), full.names = TRUE)
cat(sprintf("Searching for harmonized file with pattern: tss_harmonized_%s_%s.*\\.csv\n", protein, outcome_config$name))
cat(sprintf("Found harmonized files:\n"))
print(harmonized_file)
if (length(harmonized_file) == 0) {
cat(sprintf("No harmonized file found for %s in %s; skipping\n", protein, clump_label))
next
}
tasks <- c(tasks, list(list(
harmonized_file = harmonized_file[1],
protein_dir = protein_dir,
olink_id = protein,
clump_r2 = clump_r2,
clump_label = clump_label
)))
}
}
# Run MR analyses in parallel
cat(sprintf("Starting parallel execution of %d tasks\n", length(tasks)))
master_data <- foreach(task = tasks, .packages = c("TwoSampleMR", "MendelianRandomization", "ieugwasr", "MRInstruments", "dplyr", "tidyr", "data.table", "openxlsx", "ggplot2", "scales", "RhpcBLASctl", "patchwork", "genetics.binaRies")) %dopar% {
result <- perform_mr(
harmonized_file = task$harmonized_file,
protein_dir = task$protein_dir,
olink_id = task$olink_id,
clump_r2 = task$clump_r2,
clump_label = task$clump_label
)
result
}
# Remove NULL results
master_data <- master_data[!sapply(master_data, is.null)]
# Stop the cluster
stopCluster(cl)
cat("Parallel cluster stopped\n")
Rscript consolidate_results_250KB.R
rm(list = ls())
# Load libraries
library(openxlsx)
library(dplyr)
library(data.table)
library(parallel)
library(doParallel)
library(foreach)
library(R.utils)
# Set directories
base_dir <- "/Volumes/LaCie/tahir/mr_ukbppp_all_sbp"
results_dir <- file.path(base_dir, "results_250KB_tss", "parallel")
log_file <- file.path(base_dir, "mr_master_file_creation_250KB.log")
# Set up logging
sink(log_file, append = TRUE, split = TRUE)
cat(sprintf("Starting master file creation on %s\n", Sys.time()))
# Validate results directory
if (!dir.exists(results_dir)) {
cat("Error: Results directory does not exist:", results_dir, "\n")
sink()
stop("Results directory does not exist")
}
# Define tabs
tabs <- c("IVW", "MR-Egger", "Weighted_Median", "MR-Lasso", "MR-ConMix",
"Wald_Ratios", "Heterogeneity_Test", "Pleiotropy_Test", "Leave_One_Out", "Clumped_Data")
# Initialize master_data_combined as data.tables
master_data_combined <- setNames(lapply(tabs, function(x) data.table()), tabs)
# Find all MR_Results_*.xlsx files
cat("Searching for MR_Results_*.xlsx files in", results_dir, "\n")
result_files <- list.files(
path = results_dir,
pattern = "MR_Results_.*\\.xlsx$",
recursive = TRUE,
full.names = TRUE
)
cat("Found", length(result_files), "results files\n")
# Function to process a single file
process_file <- function(file) {
local_data <- setNames(lapply(tabs, function(x) data.table()), tabs)
path_parts <- strsplit(dirname(file), .Platform$file.sep)[[1]]
protein_id <- path_parts[length(path_parts)]
r2_value <- path_parts[length(path_parts) - 1]
cat(sprintf("Processing file: %s (protein: %s, r2: %s)\n", basename(file), protein_id, r2_value))
tryCatch({
for (tab in tabs) {
data <- tryCatch({
withTimeout({
openxlsx::read.xlsx(file, sheet = tab, startRow = 3, detectDates = TRUE)
}, timeout = 10)
}, TimeoutException = function(e) {
cat(sprintf("Timeout reading sheet %s from %s\n", tab, basename(file)))
NULL
}, error = function(e) {
cat(sprintf("Error reading sheet %s from %s: %s\n", tab, basename(file), e$message))
NULL
})
if (is.null(data) || nrow(data) == 0) {
cat(sprintf("Skipping tab %s for %s: No data\n", tab, protein_id))
next
}
if (!is.data.frame(data)) {
cat(sprintf("Warning: Sheet %s in %s is not a data frame. Skipping.\n", tab, basename(file)))
next
}
data <- as.data.table(data)
data$Protein_ID <- protein_id
data$Clump_R2 <- r2_value
local_data[[tab]] <- data
}
local_data
}, error = function(e) {
cat(sprintf("Error processing file %s: %s\n", basename(file), e$message))
NULL
})
}
# Process files (parallel with fallback to sequential)
cat("Attempting parallel processing with 4 cores\n")
num_cores <- min(4, detectCores())
cl <- tryCatch({
makeCluster(num_cores)
}, error = function(e) {
cat("Error setting up parallel cluster: ", e$message, "\n")
NULL
})
if (!is.null(cl)) {
registerDoParallel(cl)
clusterEvalQ(cl, {
library(openxlsx)
library(dplyr)
library(data.table)
library(R.utils)
})
batch_size <- ceiling(length(result_files) / num_cores)
batches <- split(result_files, ceiling(seq_along(result_files) / batch_size))
cat(sprintf("Processing %d files in %d batches\n", length(result_files), length(batches)))
results <- foreach(batch = batches, .packages = c("openxlsx", "dplyr", "data.table", "R.utils")) %dopar% {
batch_results <- lapply(batch, process_file)
batch_results
}
results <- unlist(results, recursive = FALSE)
stopCluster(cl)
cat("Parallel cluster stopped\n")
} else {
cat("Falling back to sequential processing\n")
results <- lapply(result_files, process_file)
}
# Merge results
cat("Merging results\n")
for (result in results) {
if (is.null(result)) next
for (tab in tabs) {
if (!is.null(result[[tab]]) && is.data.table(result[[tab]]) && nrow(result[[tab]]) > 0) {
tryCatch({
# Validate column names
if (nrow(master_data_combined[[tab]]) > 0) {
if (!all(names(result[[tab]]) %in% names(master_data_combined[[tab]]))) {
cat(sprintf("Warning: Column mismatch for tab %s. Adjusting columns.\n", tab))
result[[tab]] <- result[[tab]][, names(master_data_combined[[tab]]), with = FALSE]
}
}
master_data_combined[[tab]] <- rbindlist(list(master_data_combined[[tab]], result[[tab]]), use.names = TRUE, fill = TRUE)
cat(sprintf("Merged %d rows for tab %s\n", nrow(result[[tab]]), tab))
}, error = function(e) {
cat(sprintf("Error in rbindlist for tab %s: %s\n", tab, e$message))
})
}
}
gc() # Clean up memory after each result
}
# Create workbook
wb <- createWorkbook()
title_style <- createStyle(fontSize = 14, fontColour = "black", textDecoration = "bold", halign = "center")
header_style <- createStyle(fontColour = "white", fgFill = "#C8A2C8", textDecoration = "bold", halign = "center", border = "TopBottomLeftRight")
# Function to add formatted sheet
add_formatted_sheet <- function(wb, sheet_name, data, title) {
if (is.null(data) || nrow(data) == 0) {
addWorksheet(wb, sheet_name)
writeData(wb, sheet_name, "No data available")
cat(sprintf("Added empty sheet: %s\n", sheet_name))
return()
}
tryCatch({
addWorksheet(wb, sheet_name)
writeData(wb, sheet_name, title, startRow = 1, startCol = 1)
writeData(wb, sheet_name, as.data.frame(data), startRow = 3, startCol = 1, headerStyle = header_style)
mergeCells(wb, sheet_name, cols = 1:ncol(data), rows = 1)
setColWidths(wb, sheet_name, cols = 1:ncol(data), widths = "auto")
addStyle(wb, sheet_name, title_style, rows = 1, cols = 1)
cat(sprintf("Added sheet: %s with %d rows\n", sheet_name, nrow(data)))
}, error = function(e) {
cat(sprintf("Error adding sheet %s: %s\n", sheet_name, e$message))
})
}
# Create TOC and other sheets
tryCatch({
toc_data <- data.frame(
Sheet = tabs,
Title = c("Inverse Variance Weighted Results", "MR-Egger Results",
"Weighted Median Results", "MR-Lasso Results", "MR-ConMix Results",
"Wald Ratio Tests", "Heterogeneity Test Results",
"Pleiotropy Test Results", "Leave-One-Out Analysis","Clumped SNPs Data")
)
addWorksheet(wb, "TOC")
writeData(wb, "TOC", "Table of Contents", startRow = 1, startCol = 1)
mergeCells(wb, "TOC", cols = 1:2, rows = 1)
addStyle(wb, "TOC", title_style, rows = 1, cols = 1)
writeData(wb, "TOC", toc_data, startRow = 3, startCol = 1, headerStyle = header_style)
setColWidths(wb, "TOC", cols = 1:2, widths = "auto")
cat("Added TOC sheet\n")
for (tab in tabs) {
add_formatted_sheet(wb, tab, master_data_combined[[tab]], toc_data$Title[tabs == tab])
}
}, error = function(e) {
cat("Error creating TOC or sheets: ", e$message, "\n")
})
# Save workbook
today <- Sys.Date()
master_file <- file.path(results_dir, sprintf("Master_MR_Results_SBP_250KB%s.xlsx", today))
tryCatch({
saveWorkbook(wb, master_file, overwrite = TRUE)
cat(sprintf("Master file saved to: %s\n", master_file))
}, error = function(e) {
cat(sprintf("Error saving master file %s: %s\n", master_file, e$message))
})
cat(sprintf("Master file creation completed on %s\n", Sys.time()))
sink()
# Clear environment
rm(list = ls())
# Load libraries (minimal set needed for this task)
library(dplyr)
# Set directories
base_dir <- "/Volumes/LaCie/tahir/mr_ukbppp_all_sbp"
results_dir <- file.path(base_dir, "results_250KB_tss")
# Define the subdirectories to process
target_subdirs <- c("r2_01", "r2_001", "r2_0001")
# Find subdirectories, keeping only the target ones
subdirs <- list.dirs(results_dir, recursive = FALSE, full.names = TRUE)
subdirs <- subdirs[basename(subdirs) %in% target_subdirs]
# Check if target subdirectories exist
if (length(subdirs) == 0) {
cat("None of the target subdirectories (r2_01, r2_001, r2_0001) were found. Exiting.\n")
q()
}
# Remainder mirrors the 500KB check above: flag protein folders lacking tss_harmonized_*.csv files
Rscript clean_r2_merge_results.R
This largely works; the only manual step is cleaning up the stray TOC sheet carried over from the source workbooks.
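If you'd rather script that cleanup than do it by hand, something like the following should work with openxlsx once the merge below has run (a sketch; it assumes the stray sheet kept the name "TOC" from the source files):
library(openxlsx)
wb <- loadWorkbook("/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/Master_MR_Results_Merged_250KB_500KB.xlsx")
if ("TOC" %in% names(wb)) removeWorksheet(wb, "TOC") # drop the carried-over sheet
saveWorkbook(wb, "/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/Master_MR_Results_Merged_250KB_500KB.xlsx", overwrite = TRUE)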
library(openxlsx)
library(dplyr)
library(data.table)
# Define %||% operator for fallback (used below; not reliably exported by these packages)
`%||%` <- function(x, y) if (length(x) > 0) x else y
# File paths
base_dir <- "/Volumes/LaCie/tahir/mr_ukbppp_all_sbp"
file_250KB <- file.path(base_dir, "results_250KB_tss/parallel/Master_MR_Results_SBP_250KB2025-05-11.xlsx")
file_500KB <- file.path(base_dir, "results_500KB_tss/parallel/Master_MR_Results_SBP_500KB.xlsx")
merged_file <- file.path(base_dir, "Master_MR_Results_Merged_250KB_500KB.xlsx")
# Load both Excel files
wb_250KB <- loadWorkbook(file_250KB)
wb_500KB <- loadWorkbook(file_500KB)
# Extract sheet names (same for both files)
sheet_names <- names(wb_250KB)
# Create a new workbook for the merged file
wb_merged <- createWorkbook()
# Define styles
title_style <- createStyle(
fontSize = 12,
fontColour = "black",
textDecoration = "bold",
halign = "center"
)
header_style <- createStyle(
fontColour = "white",
fgFill = "#C8A2C8",
textDecoration = "bold",
halign = "center"
)
# Function to process a sheet from each file
process_sheet <- function(wb, sheet_name, TSS_region_value) {
data <- read.xlsx(wb, sheet = sheet_name, startRow = 3) # Reading from row 3
# If the sheet is empty, return an empty data.table
if (is.null(data) || nrow(data) == 0) {
return(data.table())
}
# Ensure Clump_R2 column exists and update values
if ("Clump_R2" %in% colnames(data)) {
data$Clump_R2 <- gsub("r2_01", "r2_0.1", data$Clump_R2)
data$Clump_R2 <- gsub("r2_001", "r2_0.01", data$Clump_R2)
data$Clump_R2 <- gsub("r2_0001", "r2_0.001", data$Clump_R2)
} else {
data$Clump_R2 <- NA
}
# Add TSS_region column
data$TSS_region <- TSS_region_value
return(data)
}
# Mapping for clean sheet titles
clean_sheet_names <- list(
"IVW" = "Inverse Variance Weighted (IVW)",
"MR-Egger" = "MR Egger",
"Weighted_Median" = "Weighted Median",
"MR-Lasso" = "MR Lasso",
"MR-ConMix" = "MR ConMix",
"Wald_Ratios" = "Wald Ratios",
"Heterogeneity_Test" = "Heterogeneity Test",
"Pleiotropy_Test" = "Pleiotropy Test",
"Leave_One_Out" = "Leave One Out",
"Clumped_Data" = "Clumped SNPs Data"
)
# Process and merge each sheet
for (sheet in sheet_names) {
# Read and process data from both files
data_250KB <- process_sheet(wb_250KB, sheet, "250KB")
data_500KB <- process_sheet(wb_500KB, sheet, "500KB")
# Combine data, even if one is empty
combined_data <- rbindlist(list(data_250KB, data_500KB), fill = TRUE)
# Add the combined data to the merged workbook
sheet_title <- clean_sheet_names[[sheet]] %||% sheet
addWorksheet(wb_merged, sheet_title)
writeData(wb_merged, sheet_title, sheet_title, startRow = 1)
# guard against sheets that were empty in both files (ncol = 0 breaks mergeCells)
if (ncol(combined_data) > 0) {
mergeCells(wb_merged, sheet_title, cols = 1:ncol(combined_data), rows = 1)
}
addStyle(wb_merged, sheet_title, title_style, rows = 1, cols = 1)
writeData(wb_merged, sheet_title, combined_data, startRow = 3, colNames = TRUE, headerStyle = header_style)
cat(sprintf("Processed and merged sheet: %s\n", sheet))
}
# Create the TOC sheet without TSS or Clump columns
addWorksheet(wb_merged, "Table of Contents (TOC)")
toc_data <- data.frame(
Sheet = names(clean_sheet_names),
Title = unname(unlist(clean_sheet_names))
)
writeData(wb_merged, "Table of Contents (TOC)", "Table of Contents (TOC)", startRow = 1)
mergeCells(wb_merged, "Table of Contents (TOC)", cols = 1:2, rows = 1)
addStyle(wb_merged, "Table of Contents (TOC)", title_style, rows = 1, cols = 1)
writeData(wb_merged, "Table of Contents (TOC)", toc_data, startRow = 3, colNames = TRUE, headerStyle = header_style)
setColWidths(wb_merged, "Table of Contents (TOC)", cols = 1:2, widths = "auto")
# Save the merged workbook
saveWorkbook(wb_merged, merged_file, overwrite = TRUE)
cat(sprintf("Merged file saved to: %s\n", merged_file))
#setwd("/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/app") # must be commented out before deploying
# Run with:
# library(shiny)
# runApp()
#
# library(rsconnect)
# deployApp(appDir = "/Volumes/LaCie/tahir/mr_ukbppp_all_sbp/app", appName = "mr_all_ukbppp_sbp")
# Install required packages if not already installed
if (!require(shiny)) install.packages("shiny")
if (!require(shinydashboard)) install.packages("shinydashboard")
if (!require(DT)) install.packages("DT")
if (!require(readxl)) install.packages("readxl")
if (!require(dplyr)) install.packages("dplyr")
if (!require(shinyWidgets)) install.packages("shinyWidgets")
# Load libraries
library(shiny)
library(shinydashboard)
library(DT)
library(readxl)
library(dplyr)
library(shinyWidgets)
# Define UI
ui <- fluidPage(
# Custom CSS for professional styling with taupe color scheme
tags$head(
tags$style(HTML("
body {
font-family: 'Arial', sans-serif;
background-color: #F5F4EF; /* Soft Beige Background */
color: #333333; /* Dark Gray Text */
}
#main_title {
color: #333333; /* Dark Gray */
font-weight: bold;
font-size: 2.5em;
margin-top: 20px;
}
#lab_link {
color: #483C32; /* Taupe */
font-size: 1.2em;
text-decoration: none;
transition: color 0.3s;
}
#lab_link:hover {
color: #3A2F26; /* Darker Taupe */
text-decoration: underline;
}
.download-box {
background-color: #FFFFFF; /* White */
border: 1px solid #DCDCDC; /* Light Gray Border */
border-radius: 8px;
padding: 20px;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
margin-bottom: 20px;
text-align: center;
}
.download-btn {
background-color: #483C32; /* Taupe */
color: white;
font-weight: bold;
border-radius: 5px;
padding: 10px 20px;
transition: background-color 0.3s;
}
.download-btn:hover {
background-color: #3A2F26; /* Darker Taupe */
}
.data-table-container {
background-color: #FFFFFF; /* White */
border-radius: 8px;
padding: 20px;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
margin-bottom: 20px;
}
#logo_container {
text-align: center;
margin-top: 30px;
margin-bottom: 30px;
}
#logo_image {
max-width: 300px;
height: auto;
}
.nav-tabs > li > a {
background-color: #EDEAE5; /* Light Beige */
color: #333333; /* Dark Gray */
border-radius: 5px 5px 0 0;
margin-right: 5px;
font-weight: bold;
}
.nav-tabs > li.active > a {
background-color: #483C32; /* Taupe */
color: white;
}
.error-message {
color: #D9534F; /* Soft Red */
font-weight: bold;
text-align: center;
margin: 20px;
}
"))
),
# Header Title & Lab Link
div(
style = "text-align: center; margin-bottom: 10px;",
h1(HTML("Cis-MR and Colocalization of UKB-PPP Proteins & SBP"), id = "main_title")
),
div(
style = "text-align: center; margin-bottom: 30px;",
tags$a(
href = "https://www.bidmc.org/research/research-by-department/medicine/cardiovascular-medicine/personal-genomics-and-cardiometabolic-disease",
target = "_blank",
"🔬 Lab",
id = "lab_link"
)
),
# Main layout with sidebar and main content
fluidRow(
# Left sidebar for download button
column(
width = 3,
div(
class = "download-box",
h4("Full Results & Clumped Instruments", style = "color: #333333;"),
downloadButton(
outputId = "downloadExcel",
label = "Download",
class = "download-btn"
)
)
),
# Main content with tabbed DataTables
column(
width = 9,
div(
class = "data-table-container",
uiOutput("excelTabs"),
textOutput("errorMessage")
)
)
),
# BIDMC Logo with Hyperlink (using the working snippet)
div(
id = "logo_container",
tags$a(
href = "https://www.bidmc.org/research/research-by-department/medicine/cardiovascular-medicine/personal-genomics-and-cardiometabolic-disease",
target = "_blank",
tags$img(
src = "BIDMC_HMS_Stacked-LockUp.png",
id = "logo_image",
alt = "Beth Israel Deaconess Medical Center Logo"
)
)
)
)
# Define Server
server <- function(input, output, session) {
# Path to Excel file
excel_file <- file.path("www", "Master_MR_Results_Merged_250KB_500KB.xlsx")
# Path to logo file for debugging
logo_file <- file.path("www", "BIDMC_HMS_Stacked-LockUp.png")
# Log file for debugging
log_file <- "shiny_error.log"
log_message <- function(msg) {
cat(paste0(Sys.time(), ": ", msg, "\n"), file = log_file, append = TRUE)
}
log_message("Starting Shiny app")
# Check if Excel file exists
output$errorMessage <- renderText({
if (!file.exists(excel_file)) {
msg <- paste("Error: Excel file not found at", excel_file)
log_message(msg)
return(msg)
}
log_message("Excel file found")
# Check if logo file exists
if (!file.exists(logo_file)) {
msg <- paste("Error: Logo file not found at", logo_file)
log_message(msg)
} else {
log_message("Logo file found")
}
""
})
# Define target sheets to display
target_sheets <- c("IVW", "MR Egger", "Weighted Median", "MR Lasso", "MR ConMix", "Wald Ratios", "Heterogeneity Test", "Pleiotropy Test")
# Read Excel sheet names
sheet_names <- reactive({
if (file.exists(excel_file)) {
tryCatch({
sheets <- excel_sheets(excel_file)
if (length(sheets) == 0) {
msg <- "Error: No sheets found in Excel file"
log_message(msg)
output$errorMessage <- renderText(msg)
return(NULL)
}
# Filter to only target sheets that exist
valid_sheets <- sheets[sheets %in% target_sheets]
if (length(valid_sheets) == 0) {
msg <- "Error: None of the target sheets found in Excel file"
log_message(msg)
output$errorMessage <- renderText(msg)
return(NULL)
}
log_message(paste("Found", length(valid_sheets), "target sheets:", paste(valid_sheets, collapse = ", ")))
valid_sheets
}, error = function(e) {
msg <- paste("Error reading Excel sheets:", e$message)
log_message(msg)
output$errorMessage <- renderText(msg)
return(NULL)
})
} else {
log_message("Excel file not found in sheet_names")
return(NULL)
}
})
# Reactive value to store data frames with rounded columns
excel_data <- reactive({
req(sheet_names())
tryCatch({
data_list <- lapply(sheet_names(), function(sheet) {
tryCatch({
df <- read_excel(excel_file, sheet = sheet, skip = 2)
if (nrow(df) == 0 || ncol(df) == 0) {
log_message(paste("Sheet", sheet, "is empty or has no columns after skip=2"))
return(NULL)
}
# Round the numeric results columns (b, se, stderror, p, pval, Estimate, CILower, CIUpper,
# Pvalue, egger_intercept, and the Q statistics; case-insensitive exact-name match) to 4 decimal places
cols_to_round <- grep("^(egger_intercept|q_df|q_pval|q|b|se|stderror|p|pval|Estimate|CILower|CIUpper|Pvalue)$", names(df), ignore.case = TRUE, value = TRUE)
if (length(cols_to_round) > 0) {
df <- df %>%
mutate(across(all_of(cols_to_round), ~ round(as.numeric(.), 4)))
}
log_message(paste("Read sheet", sheet, "with", nrow(df), "rows and", ncol(df), "columns"))
df
}, error = function(e) {
log_message(paste("Error reading sheet", sheet, ":", e$message))
return(NULL)
})
})
# Filter out NULL (empty or errored) sheets; name the list before subsetting
# so sheet names stay aligned with the surviving data frames
names(data_list) <- sheet_names()
data_list <- data_list[!sapply(data_list, is.null)]
if (length(data_list) == 0) {
msg <- "Error: No target sheets contain valid data"
log_message(msg)
output$errorMessage <- renderText(msg)
return(NULL)
}
data_list
}, error = function(e) {
msg <- paste("Error reading Excel data:", e$message)
log_message(msg)
output$errorMessage <- renderText(msg)
return(NULL)
})
})
# Render tabset for Excel sheets
output$excelTabs <- renderUI({
req(excel_data())
data <- excel_data()
if (is.null(data) || length(data) == 0) {
log_message("No data available to display")
return(div(class = "error-message", "No valid data sheets available to display."))
}
valid_sheets <- names(data)
tabs <- lapply(seq_along(valid_sheets), function(i) {
tabPanel(
title = valid_sheets[i],
DTOutput(paste0("table_", i))
)
})
log_message(paste("Rendering tabset with", length(tabs), "tabs"))
do.call(tabsetPanel, tabs)
})
# Render DataTables for each sheet (no Excel button)
observe({
data <- excel_data()
req(data)
valid_sheets <- names(data)
lapply(seq_along(valid_sheets), function(i) {
output[[paste0("table_", i)]] <- renderDT({
tryCatch({
log_message(paste("Rendering table for sheet", valid_sheets[i]))
datatable(
data[[i]],
filter = "top",
options = list(
pageLength = 10,
autoWidth = TRUE,
scrollX = TRUE
),
class = 'cell-border stripe'
)
}, error = function(e) {
msg <- paste("Error rendering table for sheet", valid_sheets[i], ":", e$message)
log_message(msg)
output$errorMessage <- renderText(msg)
return(NULL)
})
})
})
})
# Download handler for Excel file
output$downloadExcel <- downloadHandler(
filename = function() {
"Master_MR_Results_Merged_250KB_500KB.xlsx"
},
content = function(file) {
if (file.exists(excel_file)) {
log_message("Downloading Excel file")
file.copy(excel_file, file)
} else {
msg <- paste("Excel file not found for download:", excel_file)
log_message(msg)
stop(msg)
}
}
)
}
# Run the Shiny App
shinyApp(ui = ui, server = server)