I partially hacked xQTLbiolinks—Jeremy’s suggestion for retrieving GTEx eQTLs and running eQTL-pQTL coloc—to run HyPrColoc on our cis-regions, linking PCSK9 pQTL and METSIM metQTL data.
It’s a hack because xQTLbiolinks is designed for one GWAS trait (e.g., one pQTL) with one or more eQTLs, not multiple GWAS traits. While it correctly identifies the same causal SNP as HyPrColoc, it doesn’t indicate which traits colocalize. I’ll keep refining it—this is a solid prelude to what Jeremy wants and a good way to get familiar with xQTLbiolinks’ utilities.
# Load required libraries
library(data.table)
library(hyprcoloc)
library(openxlsx)
library(DT)
library(knitr)
library(xQTLbiolinks)
# Load phenotype mapping data
phenotype_map <- fread("/Users/charleenadams/metsim_files/phenotypes.tsv")
print(names(phenotype_map)) # Confirm structure of phenotype_map
## [1] "chrom" "pos" "ref" "alt"
## [5] "rsids" "nearest_genes" "pval" "num_peaks"
## [9] "phenocode" "category" "phenostring"
# Define file path to harmonized results: note, this directory contains the files with at least one SNP P<5E-8
file_path <- "/Users/charleenadams/harmonized_results_fixed/harmonized_results_PCSK9.txt"
# Read the harmonized data
data <- fread(file_path)
# Remove all characters after and including the first `_`
data$id.outcome <- sub("_.*", "", data$id.outcome)
data$id.exposure <- sub("_.*", "", data$id.exposure)
# Ensure both data frames are data.tables
setDT(data)
setDT(phenotype_map)
# Merge phenostring from phenotype_map into data using id.outcome and phenocode
data <- merge(data, phenotype_map[, .(phenocode, phenostring)],
by.x = "id.outcome", by.y = "phenocode", all.x = TRUE)
# Rename the phenostring column to METSIM_name
setnames(data, "phenostring", "METSIM_name")
# Remove asterisks (*) from the METSIM_name column
data$METSIM_name <- gsub("\\*", "", data$METSIM_name)
cat("Data read successfully. Total rows:", nrow(data), "\n")
## Data read successfully. Total rows: 115838
# Ensure the data contains the required columns
required_columns <- c("SNP", "id.exposure", "METSIM_name", "beta.exposure", "beta.outcome", "se.exposure", "se.outcome")
if (!all(required_columns %in% colnames(data))) {
stop("The harmonized data is missing required columns!")
}
# Summarize pval.outcome for each id.outcome
pval_summary <- data[, .(
min_pval = min(pval.outcome, na.rm = TRUE),
max_pval = max(pval.outcome, na.rm = TRUE),
mean_pval = mean(pval.outcome, na.rm = TRUE),
median_pval = median(pval.outcome, na.rm = TRUE),
count = .N # Count number of SNPs per id.outcome
), by = id.outcome]
# Display results
# print(pval_summary)
# pval_summary <- pval_summary[order(pval_summary$min_pval),]
# Extract unique traits and common SNPs
traits <- unique(c(data$id.exposure, data$METSIM_name))
common_snps <- unique(data$SNP)
cat("Number of traits:", length(traits), "Number of SNPs:", length(common_snps), "\n")
## Number of traits: 25 Number of SNPs: 4850
# Filter data to include only common SNPs
data <- data[SNP %in% common_snps]
# Reshape data to create a long table for matrix population
long_data <- rbindlist(list(
data[, .(SNP, trait = id.exposure, beta = beta.exposure, se = se.exposure)],
data[, .(SNP, trait = METSIM_name, beta = beta.outcome, se = se.outcome)]
), use.names = TRUE)
# Ensure no duplicate SNP-trait combinations
long_data <- unique(long_data, by = c("SNP", "trait"))
cat("Rows in reshaped long_data:", nrow(long_data), "\n")
## Rows in reshaped long_data: 120688
# Create matrices for betas and ses
betas_mat <- dcast(long_data, SNP ~ trait, value.var = "beta", fill = NA)
ses_mat <- dcast(long_data, SNP ~ trait, value.var = "se", fill = NA)
snps <- betas_mat$SNP
# Convert matrices to the appropriate format for hyprcoloc
rownames(betas_mat) <- betas_mat$SNP
rownames(ses_mat) <- ses_mat$SNP
betas_mat <- as.matrix(betas_mat[, -1, with = FALSE])
ses_mat <- as.matrix(ses_mat[, -1, with = FALSE])
# Restore the SNP column as rownames
rownames(betas_mat) <- snps
rownames(ses_mat) <- snps
# Confirm rownames are correctly set
head(rownames(betas_mat))
## [1] "rs1000364" "rs1002778" "rs1003323" "rs1003767" "rs1003768" "rs1003769"
cat("Dimensions of beta matrix:", dim(betas_mat), "\n")
## Dimensions of beta matrix: 4850 25
cat("Dimensions of SE matrix:", dim(ses_mat), "\n")
## Dimensions of SE matrix: 4850 25
# Check for rows with missing values
rows_with_na_betas <- apply(betas_mat, 1, function(row) any(is.na(row)))
rows_with_na_ses <- apply(ses_mat, 1, function(row) any(is.na(row)))
# Filter matrices to keep only rows with no missing values
betas_mat <- betas_mat[!rows_with_na_betas, , drop = FALSE]
ses_mat <- ses_mat[!rows_with_na_ses, , drop = FALSE]
cat("Dimensions of beta matrix:", dim(betas_mat), "\n")
## Dimensions of beta matrix: 4747 25
cat("Dimensions of SE matrix:", dim(ses_mat), "\n")
## Dimensions of SE matrix: 4747 25
# Ensure SNP filtering consistency
setDT(data)
snps_to_keep <- rownames(betas_mat)
data_subset <- data[SNP %in% snps_to_keep]
cat("Subset data dimensions:", dim(data_subset), "\n")
## Subset data dimensions: 113928 33
cat("Subset data dimensions:", dim(data), "\n")
## Subset data dimensions: 115838 33
# Save filtered data
fwrite(data_subset, "filtered_data.csv")
# Prepare GWAS and eQTL datasets for colocalization
gwasDF <- data_subset[, .(
rsid = SNP,
chr = paste0("chr", chr.exposure),
position = pos.exposure,
pValue = pval.exposure,
maf = eaf.exposure,
beta = beta.exposure,
se = se.exposure
)]
qtlDF <- data_subset[, .(
rsid = SNP,
chr = paste0("chr", chr.outcome),
position = pos.outcome,
pValue = pval.outcome,
maf = eaf.outcome,
beta = beta.outcome,
se = se.outcome
)]
# Ensure p-values are valid
gwasDF[pValue <= 0, pValue := 1e-50]
qtlDF[pValue <= 0, pValue := 1e-50]
# Run colocalization analysis
coloc_results <- xQTLanalyze_coloc_diy(
gwasDF = gwasDF,
qtlDF = qtlDF,
method = "Both" # Runs both coloc and hyprcoloc
)
## PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf PP.H4.abf
## 2.84e-97 5.35e-12 5.31e-86 1.00e+00 3.04e-08
## [1] "PP abf for shared variant: 3.04e-06%"
## nsnps PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf PP.H4.abf
## <num> <num> <num> <num> <num> <num>
## 1: 3313 2.841411e-97 5.348077e-12 5.312957e-86 1 3.041164e-08
## candidate_snp SNP.PP.H4 hypr_posterior hypr_regional_prob hypr_candidate_snp
## <char> <num> <num> <num> <char>
## 1: rs530804537 0.8517936 1 1 rs11591147
## hypr_posterior_explainedBySnp
## <num>
## 1: 1
# Save results
if (!is.null(coloc_results)) {
fwrite(coloc_results$coloc_Out_summary, "coloc_results_all_genes.csv")
}
# Print summary
print(coloc_results$coloc_Out_summary)
## nsnps PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf PP.H4.abf
## <num> <num> <num> <num> <num> <num>
## 1: 3313 2.841411e-97 5.348077e-12 5.312957e-86 1 3.041164e-08
## candidate_snp SNP.PP.H4 hypr_posterior hypr_regional_prob hypr_candidate_snp
## <char> <num> <num> <num> <char>
## 1: rs530804537 0.8517936 1 1 rs11591147
## hypr_posterior_explainedBySnp
## <num>
## 1: 1
library(xQTLbiolinks)
library(data.table)
gwasDF <- fread("http://bioinfo.szbl.ac.cn/xQTL_biolinks/xqtl_data/gwasDFsub.txt")
tissueSiteDetail="Brain - Cerebellum"
sentinelSnpDF <- xQTLanalyze_getSentinelSnp(gwasDF, pValueThreshold = 5e-08)
traitsAll <- xQTLanalyze_getTraits(sentinelSnpDF, detectRange=1e6, tissueSiteDetail=tissueSiteDetail)
traitsAll
outputs <- rbindlist(lapply( unique(traitsAll$gencodeId), function(x){ # using gencode ID.
xQTLanalyze_coloc(gwasDF, x, tissueSiteDetail=tissueSiteDetail, method = "Both")$colocOut }))
## == Downloading eQTL of ENSG00000002587.9 ... > Success!
## PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf PP.H4.abf
## 1.73e-05 3.22e-02 6.60e-05 1.22e-01 8.46e-01
## [1] "PP abf for shared variant: 84.6%"
## Key: <traitGene>
## traitGene nsnps PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf
## <char> <num> <num> <num> <num> <num>
## 1: ENSG00000002587 6452 1.730175e-05 0.0321843 6.603361e-05 0.1219884
## PP.H4.abf candidate_snp SNP.PP.H4 hypr_posterior hypr_regional_prob
## <num> <char> <num> <num> <num>
## 1: 0.845744 rs13120565 0.4140146 0.5685 0.9694
## hypr_candidate_snp hypr_posterior_explainedBySnp
## <char> <num>
## 1: rs13120565 0.2726
## == Downloading eQTL of ENSG00000109684.14 ... > Success!
## PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf PP.H4.abf
## 7.10e-11 1.32e-07 3.89e-06 6.25e-03 9.94e-01
## [1] "PP abf for shared variant: 99.4%"
## Key: <traitGene>
## traitGene nsnps PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf
## <char> <num> <num> <num> <num> <num>
## 1: ENSG00000109684 7107 7.097893e-11 1.32221e-07 3.890211e-06 0.00625302
## PP.H4.abf candidate_snp SNP.PP.H4 hypr_posterior hypr_regional_prob
## <num> <char> <num> <num> <num>
## 1: 0.993743 rs13120565 0.5328849 0.9793 0.9999
## hypr_candidate_snp hypr_posterior_explainedBySnp
## <char> <num>
## 1: rs13120565 0.4747
## == Downloading eQTL of ENSG00000261490.1 ... > Success!
## PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf PP.H4.abf
## 5.29e-05 9.85e-02 4.80e-04 8.94e-01 6.63e-03
## [1] "PP abf for shared variant: 0.663%"
## Key: <traitGene>
## traitGene nsnps PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf
## <char> <num> <num> <num> <num> <num>
## 1: ENSG00000261490 6601 5.287051e-05 0.09848309 0.0004801374 0.8943562
## PP.H4.abf candidate_snp SNP.PP.H4 hypr_posterior hypr_regional_prob
## <num> <char> <num> <num> <num>
## 1: 0.00662768 rs13120565 0.421965 0 0.0101
## hypr_candidate_snp hypr_posterior_explainedBySnp
## <char> <num>
## 1: rs13120565 0.4118
outputs
# Examine the format
multi_traits <- traitsAll$gencodeId
test_gene <- unique(traitsAll$gencodeId)[1] # Select one gene
eqtl_data <- xQTLanalyze_coloc(gwasDF, test_gene, tissueSiteDetail=tissueSiteDetail, method="Both")
## == Downloading eQTL of ENSG00000002587.9 ... > Success!
## PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf PP.H4.abf
## 1.73e-05 3.22e-02 6.60e-05 1.22e-01 8.46e-01
## [1] "PP abf for shared variant: 84.6%"
## Key: <traitGene>
## traitGene nsnps PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf
## <char> <num> <num> <num> <num> <num>
## 1: ENSG00000002587 6452 1.730175e-05 0.0321843 6.603361e-05 0.1219884
## PP.H4.abf candidate_snp SNP.PP.H4 hypr_posterior hypr_regional_prob
## <num> <char> <num> <num> <num>
## 1: 0.845744 rs13120565 0.4140146 0.5685 0.9694
## hypr_candidate_snp hypr_posterior_explainedBySnp
## <char> <num>
## 1: rs13120565 0.2726
# Print the returned object structure
merged_data <- xQTLanalyze_coloc(gwasDF, test_gene, tissueSiteDetail=tissueSiteDetail, method="Both")$gwasEqtlInfo
## == Downloading eQTL of ENSG00000002587.9 ... > Success!
## PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf PP.H4.abf
## 1.73e-05 3.22e-02 6.60e-05 1.22e-01 8.46e-01
## [1] "PP abf for shared variant: 84.6%"
## Key: <traitGene>
## traitGene nsnps PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf
## <char> <num> <num> <num> <num> <num>
## 1: ENSG00000002587 6452 1.730175e-05 0.0321843 6.603361e-05 0.1219884
## PP.H4.abf candidate_snp SNP.PP.H4 hypr_posterior hypr_regional_prob
## <num> <char> <num> <num> <num>
## 1: 0.845744 rs13120565 0.4140146 0.5685 0.9694
## hypr_candidate_snp hypr_posterior_explainedBySnp
## <char> <num>
## 1: rs13120565 0.2726
# View structure
str(merged_data)
## Classes 'data.table' and 'data.frame': 6452 obs. of 11 variables:
## $ rsid : chr "rs10000201" "rs10000222" "rs10000318" "rs10000369" ...
## $ chr : chr "chr4" "chr4" "chr4" "chr4" ...
## $ position : num 12218673 12262368 11613397 11234631 11595842 ...
## $ pValue.gwas: num 0.242 0.637 0.581 0.0453 0.237 0.505 0.945 0.166 0.0568 0.193 ...
## $ maf.gwas : num 0.6003 0.9983 0.2976 0.0459 0.7449 ...
## $ beta.gwas : num 0.0034 -0.02555 0.00174 -0.01415 0.0038 ...
## $ se.gwas : num 0.00291 0.05421 0.00316 0.00707 0.00322 ...
## $ maf.eqtl : num 0.4115 0.0144 0.2512 0.067 0.2703 ...
## $ pValue.eqtl: num 0.9096 0.9513 0.0869 0.0461 0.7072 ...
## $ beta.eqtl : num 0.00692 -0.01422 -0.11557 0.25867 -0.02498 ...
## $ se.eqtl : num 0.0608 0.2323 0.0671 0.1287 0.0664 ...
## - attr(*, ".internal.selfref")=<externalptr>
## - attr(*, "sorted")= chr "rsid"
head(merged_data)
# List of eQTL genes to test
eqtl_genes <- unique(traitsAll$gencodeId) # List of trait genes
# Initialize a list to store results
merged_data_list <- list()
# Loop through each gene and perform colocalization analysis
for (gene in eqtl_genes) {
cat("Processing gene:", gene, "\n")
# Run colocalization analysis for this gene
coloc_result <- xQTLanalyze_coloc(gwasDF, gene, tissueSiteDetail=tissueSiteDetail, method="Both")
# Extract the merged GWAS + eQTL dataset
if (!is.null(coloc_result$gwasEqtlInfo)) {
merged_data_list[[gene]] <- coloc_result$gwasEqtlInfo
}
}
## Processing gene: ENSG00000002587
## == Downloading eQTL of ENSG00000002587.9 ... > Success!
## PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf PP.H4.abf
## 1.73e-05 3.22e-02 6.60e-05 1.22e-01 8.46e-01
## [1] "PP abf for shared variant: 84.6%"
## Key: <traitGene>
## traitGene nsnps PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf
## <char> <num> <num> <num> <num> <num>
## 1: ENSG00000002587 6452 1.730175e-05 0.0321843 6.603361e-05 0.1219884
## PP.H4.abf candidate_snp SNP.PP.H4 hypr_posterior hypr_regional_prob
## <num> <char> <num> <num> <num>
## 1: 0.845744 rs13120565 0.4140146 0.5685 0.9694
## hypr_candidate_snp hypr_posterior_explainedBySnp
## <char> <num>
## 1: rs13120565 0.2726
## Processing gene: ENSG00000109684
## == Downloading eQTL of ENSG00000109684.14 ... > Success!
## PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf PP.H4.abf
## 7.10e-11 1.32e-07 3.89e-06 6.25e-03 9.94e-01
## [1] "PP abf for shared variant: 99.4%"
## Key: <traitGene>
## traitGene nsnps PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf
## <char> <num> <num> <num> <num> <num>
## 1: ENSG00000109684 7107 7.097893e-11 1.32221e-07 3.890211e-06 0.00625302
## PP.H4.abf candidate_snp SNP.PP.H4 hypr_posterior hypr_regional_prob
## <num> <char> <num> <num> <num>
## 1: 0.993743 rs13120565 0.5328849 0.9793 0.9999
## hypr_candidate_snp hypr_posterior_explainedBySnp
## <char> <num>
## 1: rs13120565 0.4747
## Processing gene: ENSG00000261490
## == Downloading eQTL of ENSG00000261490.1 ... > Success!
## PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf PP.H4.abf
## 5.29e-05 9.85e-02 4.80e-04 8.94e-01 6.63e-03
## [1] "PP abf for shared variant: 0.663%"
## Key: <traitGene>
## traitGene nsnps PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf
## <char> <num> <num> <num> <num> <num>
## 1: ENSG00000261490 6601 5.287051e-05 0.09848309 0.0004801374 0.8943562
## PP.H4.abf candidate_snp SNP.PP.H4 hypr_posterior hypr_regional_prob
## <num> <char> <num> <num> <num>
## 1: 0.00662768 rs13120565 0.421965 0 0.0101
## hypr_candidate_snp hypr_posterior_explainedBySnp
## <char> <num>
## 1: rs13120565 0.4118
# Combine all results into a single data.table
merged_data_combined <- rbindlist(merged_data_list, idcol="gene")
# Save to file
#fwrite(merged_data_combined, "merged_gwas_eqtl_all_traits.csv")
# Print structure and first few rows
str(merged_data_combined)
## Classes 'data.table' and 'data.frame': 20160 obs. of 12 variables:
## $ gene : chr "ENSG00000002587" "ENSG00000002587" "ENSG00000002587" "ENSG00000002587" ...
## $ rsid : chr "rs10000201" "rs10000222" "rs10000318" "rs10000369" ...
## $ chr : chr "chr4" "chr4" "chr4" "chr4" ...
## $ position : num 12218673 12262368 11613397 11234631 11595842 ...
## $ pValue.gwas: num 0.242 0.637 0.581 0.0453 0.237 0.505 0.945 0.166 0.0568 0.193 ...
## $ maf.gwas : num 0.6003 0.9983 0.2976 0.0459 0.7449 ...
## $ beta.gwas : num 0.0034 -0.02555 0.00174 -0.01415 0.0038 ...
## $ se.gwas : num 0.00291 0.05421 0.00316 0.00707 0.00322 ...
## $ maf.eqtl : num 0.4115 0.0144 0.2512 0.067 0.2703 ...
## $ pValue.eqtl: num 0.9096 0.9513 0.0869 0.0461 0.7072 ...
## $ beta.eqtl : num 0.00692 -0.01422 -0.11557 0.25867 -0.02498 ...
## $ se.eqtl : num 0.0608 0.2323 0.0671 0.1287 0.0664 ...
## - attr(*, ".internal.selfref")=<externalptr>
head(merged_data_combined)
# Run colocalization for all genes using the existing merged data
coloc_results <- xQTLanalyze_coloc_diy(
gwasDF = merged_data_combined[, .(rsid, chr, position,
pValue = pValue.gwas,
maf = maf.gwas,
beta = beta.gwas,
se = se.gwas)],
qtlDF = merged_data_combined[, .(rsid, chr, position,
pValue = pValue.eqtl,
maf = maf.eqtl,
beta = beta.eqtl,
se = se.eqtl)],
method = "Both" # Run both coloc & hyprcoloc
)
## PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf PP.H4.abf
## 9.25e-14 1.56e-10 4.60e-06 6.95e-03 9.93e-01
## [1] "PP abf for shared variant: 99.3%"
## nsnps PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf PP.H4.abf
## <num> <num> <num> <num> <num> <num>
## 1: 11031 9.252856e-14 1.562464e-10 4.597347e-06 0.006947266 0.9930481
## candidate_snp SNP.PP.H4 hypr_posterior hypr_regional_prob hypr_candidate_snp
## <char> <num> <num> <num> <char>
## 1: rs13120565 0.505961 0.9793 0.9999 rs13120565
## hypr_posterior_explainedBySnp
## <num>
## 1: 0.4747
# Save results to CSV
if (!is.null(coloc_results)) {
fwrite(coloc_results$coloc_Out_summary, "coloc_results_all_genes.csv")
}
# Print results
print(coloc_results$coloc_Out_summary)
## nsnps PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf PP.H4.abf
## <num> <num> <num> <num> <num> <num>
## 1: 11031 9.252856e-14 1.562464e-10 4.597347e-06 0.006947266 0.9930481
## candidate_snp SNP.PP.H4 hypr_posterior hypr_regional_prob hypr_candidate_snp
## <char> <num> <num> <num> <char>
## 1: rs13120565 0.505961 0.9793 0.9999 rs13120565
## hypr_posterior_explainedBySnp
## <num>
## 1: 0.4747