I partially hacked xQTLbiolinks—Jeremy’s suggestion for retrieving GTEx eQTLs and running eQTL-pQTL coloc—to run HyPrColoc on our cis-regions, linking PCSK9 pQTL and METSIM metQTL data.

It’s a hack because xQTLbiolinks is designed for one GWAS trait (e.g., one pQTL) with one or more eQTLs, not multiple GWAS traits. While it correctly identifies the same causal SNP as HyPrColoc, it doesn’t indicate which traits colocalize. I’ll keep refining it—this is a solid prelude to what Jeremy wants and a good way to get familiar with xQTLbiolinks’ utilities.

1 PCSK9 and METSIM metabolites

# Load required libraries
library(data.table)
library(hyprcoloc)
library(openxlsx)
library(DT)
library(knitr)
library(xQTLbiolinks)

# Load phenotype mapping data
phenotype_map <- fread("/Users/charleenadams/metsim_files/phenotypes.tsv")
print(names(phenotype_map))  # Confirm structure of phenotype_map
##  [1] "chrom"         "pos"           "ref"           "alt"          
##  [5] "rsids"         "nearest_genes" "pval"          "num_peaks"    
##  [9] "phenocode"     "category"      "phenostring"
# Define file path to harmonized results: note, this directory contains the files with at least one SNP P<5E-8
file_path <- "/Users/charleenadams/harmonized_results_fixed/harmonized_results_PCSK9.txt"

# Read the harmonized data
data <- fread(file_path)

# Remove all characters after and including the first `_`
data$id.outcome <- sub("_.*", "", data$id.outcome)
data$id.exposure <- sub("_.*", "", data$id.exposure)

# Ensure both data frames are data.tables
setDT(data)
setDT(phenotype_map)

# Merge phenostring from phenotype_map into data using id.outcome and phenocode
data <- merge(data, phenotype_map[, .(phenocode, phenostring)], 
              by.x = "id.outcome", by.y = "phenocode", all.x = TRUE)

# Rename the phenostring column to METSIM_name
setnames(data, "phenostring", "METSIM_name")

# Remove asterisks (*) from the METSIM_name column
data$METSIM_name <- gsub("\\*", "", data$METSIM_name)
cat("Data read successfully. Total rows:", nrow(data), "\n")
## Data read successfully. Total rows: 115838
# Ensure the data contains the required columns
required_columns <- c("SNP", "id.exposure", "METSIM_name", "beta.exposure", "beta.outcome", "se.exposure", "se.outcome")
if (!all(required_columns %in% colnames(data))) {
  stop("The harmonized data is missing required columns!")
}

# Summarize pval.outcome for each id.outcome
pval_summary <- data[, .(
  min_pval = min(pval.outcome, na.rm = TRUE),
  max_pval = max(pval.outcome, na.rm = TRUE),
  mean_pval = mean(pval.outcome, na.rm = TRUE),
  median_pval = median(pval.outcome, na.rm = TRUE),
  count = .N  # Count number of SNPs per id.outcome
), by = id.outcome]

# Display results
# print(pval_summary)
# pval_summary <- pval_summary[order(pval_summary$min_pval),]

# Extract unique traits and common SNPs
traits <- unique(c(data$id.exposure, data$METSIM_name))
common_snps <- unique(data$SNP)
cat("Number of traits:", length(traits), "Number of SNPs:", length(common_snps), "\n")
## Number of traits: 25 Number of SNPs: 4850
# Filter data to include only common SNPs
data <- data[SNP %in% common_snps]

# Reshape data to create a long table for matrix population
long_data <- rbindlist(list(
  data[, .(SNP, trait = id.exposure, beta = beta.exposure, se = se.exposure)],
  data[, .(SNP, trait = METSIM_name, beta = beta.outcome, se = se.outcome)]
), use.names = TRUE)

# Ensure no duplicate SNP-trait combinations
long_data <- unique(long_data, by = c("SNP", "trait"))
cat("Rows in reshaped long_data:", nrow(long_data), "\n")
## Rows in reshaped long_data: 120688
# Create matrices for betas and ses
betas_mat <- dcast(long_data, SNP ~ trait, value.var = "beta", fill = NA)
ses_mat <- dcast(long_data, SNP ~ trait, value.var = "se", fill = NA)
snps <- betas_mat$SNP

# Convert matrices to the appropriate format for hyprcoloc
rownames(betas_mat) <- betas_mat$SNP
rownames(ses_mat) <- ses_mat$SNP

betas_mat <- as.matrix(betas_mat[, -1, with = FALSE])
ses_mat <- as.matrix(ses_mat[, -1, with = FALSE])

# Restore the SNP column as rownames
rownames(betas_mat) <- snps
rownames(ses_mat) <- snps

# Confirm rownames are correctly set
head(rownames(betas_mat))
## [1] "rs1000364" "rs1002778" "rs1003323" "rs1003767" "rs1003768" "rs1003769"
cat("Dimensions of beta matrix:", dim(betas_mat), "\n")
## Dimensions of beta matrix: 4850 25
cat("Dimensions of SE matrix:", dim(ses_mat), "\n")
## Dimensions of SE matrix: 4850 25
# Check for rows with missing values
rows_with_na_betas <- apply(betas_mat, 1, function(row) any(is.na(row)))
rows_with_na_ses <- apply(ses_mat, 1, function(row) any(is.na(row)))

# Filter matrices to keep only rows with no missing values
betas_mat <- betas_mat[!rows_with_na_betas, , drop = FALSE]
ses_mat <- ses_mat[!rows_with_na_ses, , drop = FALSE]

cat("Dimensions of beta matrix:", dim(betas_mat), "\n")
## Dimensions of beta matrix: 4747 25
cat("Dimensions of SE matrix:", dim(ses_mat), "\n")
## Dimensions of SE matrix: 4747 25
# Ensure SNP filtering consistency
setDT(data)
snps_to_keep <- rownames(betas_mat)
data_subset <- data[SNP %in% snps_to_keep]
cat("Subset data dimensions:", dim(data_subset), "\n")
## Subset data dimensions: 113928 33
cat("Subset data dimensions:", dim(data), "\n")
## Subset data dimensions: 115838 33
# Save filtered data
fwrite(data_subset, "filtered_data.csv")

# Prepare GWAS and eQTL datasets for colocalization
gwasDF <- data_subset[, .(
  rsid = SNP,
  chr = paste0("chr", chr.exposure),
  position = pos.exposure,
  pValue = pval.exposure,
  maf = eaf.exposure,
  beta = beta.exposure,
  se = se.exposure
)]

qtlDF <- data_subset[, .(
  rsid = SNP,
  chr = paste0("chr", chr.outcome),
  position = pos.outcome,
  pValue = pval.outcome,
  maf = eaf.outcome,
  beta = beta.outcome,
  se = se.outcome
)]

# Ensure p-values are valid
gwasDF[pValue <= 0, pValue := 1e-50]
qtlDF[pValue <= 0, pValue := 1e-50]

# Run colocalization analysis
coloc_results <- xQTLanalyze_coloc_diy(
  gwasDF = gwasDF,
  qtlDF = qtlDF,
  method = "Both"  # Runs both coloc and hyprcoloc
)
## PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf PP.H4.abf 
##  2.84e-97  5.35e-12  5.31e-86  1.00e+00  3.04e-08 
## [1] "PP abf for shared variant: 3.04e-06%"
##    nsnps    PP.H0.abf    PP.H1.abf    PP.H2.abf PP.H3.abf    PP.H4.abf
##    <num>        <num>        <num>        <num>     <num>        <num>
## 1:  3313 2.841411e-97 5.348077e-12 5.312957e-86         1 3.041164e-08
##    candidate_snp SNP.PP.H4 hypr_posterior hypr_regional_prob hypr_candidate_snp
##           <char>     <num>          <num>              <num>             <char>
## 1:   rs530804537 0.8517936              1                  1         rs11591147
##    hypr_posterior_explainedBySnp
##                            <num>
## 1:                             1
# Save results
if (!is.null(coloc_results)) {
  fwrite(coloc_results$coloc_Out_summary, "coloc_results_all_genes.csv")
}

# Print summary
print(coloc_results$coloc_Out_summary)
##    nsnps    PP.H0.abf    PP.H1.abf    PP.H2.abf PP.H3.abf    PP.H4.abf
##    <num>        <num>        <num>        <num>     <num>        <num>
## 1:  3313 2.841411e-97 5.348077e-12 5.312957e-86         1 3.041164e-08
##    candidate_snp SNP.PP.H4 hypr_posterior hypr_regional_prob hypr_candidate_snp
##           <char>     <num>          <num>              <num>             <char>
## 1:   rs530804537 0.8517936              1                  1         rs11591147
##    hypr_posterior_explainedBySnp
##                            <num>
## 1:                             1