Here, we conduct a pathway enrichment analysis to investigate pleiotropy within biologically defined pathways for top SNPs identified in the Sun et al. (2023) UK Biobank Pharma Proteomics Project (UKB-PPP) 1. The study’s Supplementary Table 9 (ST9), which we utilized, includes 30,743 significant pQTLs (P < 1.7 × 10⁻¹¹) associated with 2,415 Olink proteins.
Our primary objective is to identify and characterize patterns of pleiotropy—where a single genetic variant influences multiple proteins—within pathways, with a specific focus on the chemokine signaling pathways.
This analysis is motivated by the need to better understand the biological, functional, and technical implications of pleiotropic genetic variants in the context of proteomics.
Source: We utilized data from the UK Biobank Pharma Proteomics Project (UKB-PPP) from Supplementary Table 9 of Sun et al. (2023). This dataset includes 30,743 significant protein quantitative trait loci (pQTLs) with a significance threshold of \(P < 1.7 \times 10^{-11}\) linked to 2,415 Olink proteins.
Initial Processing: SNPs were identified by a unique combination of variant ID, associated gene, and rsID.
Filtering for Pleiotropy:
enrichGO
function from the clusterProfiler package, focusing on
biological processes. Genes were mapped to Gene Ontology terms using the
org.Hs.eg.db database.Proportion Calculation: For each SNP, we calculated the proportion of proteins it affects within a given pathway:
\[ \text{Proportion}_{\text{SNP}} = \frac{\text{Number of Linked Proteins}}{\text{Total Proteins in Pathway}} \]
Pathway-Wide Proportions: A weighted average was computed for each pathway, where the weights were the total number of proteins in each pathway.
# Suppress output while loading libraries
suppressPackageStartupMessages({
library(stringr)
library(purrr)
library(clusterProfiler)
library(org.Hs.eg.db)
library(dplyr)
library(tidyr)
library(ggplot2)
library(DT)
library(dplyr)
library(data.table)
})
# Load the necessary library for better readability (optional)
setwd("/Users/charleenadams/temp_BI/chemokine_rgs_olink")
sun_sigs_ST9 <- read.csv("/Users/charleenadams/temp_BI/chemokine_rgs_olink/sun_sigs.csv")
sun_sigs_ST9$ID <- paste0(sun_sigs_ST9$Variant.ID..CHROM.GENPOS..hg37..A0.A1.imp.v1., "_", sun_sigs_ST9$Bioinfomatic.annotated.gene, "_", sun_sigs_ST9$rsID)
unique_sig_protein <- unique(sun_sigs_ST9$Assay.Target)
# Find common rsIDs across different Assay.Targets
common_rsids <- sun_sigs_ST9 %>%
group_by(ID) %>%
filter(n_distinct(Assay.Target) > 1) %>%
distinct(ID, Assay.Target) %>%
arrange(ID)
#write.csv(common_rsids,"/Users/charleenadams/temp_BI/chemokine_rgs_olink/common_rsids_all_SUN.csv")
# testid <- common_rsids[which(common_rsids$ID=="3:56849749:T:C:imp:v1_ARHGEF3_rs1354034"),]
# # Clean up the IDs in the common_rsids data frame
# common_rsids$original_Id <- common_rsids$ID
# common_rsids <- common_rsids %>%
# mutate(ID = str_replace(ID, "(:[ATCG]+:[ATCG]+:imp:v1)", "")) # Remove only the undesired pattern
# Alternate way to clean those ids:
# clean_ids <- function(ids) {
# sub(":[^:]+:[^:]+:imp:v1", "_", ids) # Remove `:T:C:imp:v1` or similar patterns
# }
# Count unique Assay.Targets by ID and sort in descending order
assay_targets_by_id <- common_rsids %>%
dplyr::group_by(ID) %>% # Group by ID
dplyr::summarize(unique_assay_targets = n_distinct(Assay.Target), .groups = "drop") %>% # Count unique Assay.Targets
dplyr::arrange(desc(unique_assay_targets)) # Arrange in descending order
# Filter by 3rd quartile: shared SNP >=4
summary(assay_targets_by_id$unique_assay_targets)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.000 3.000 5.941 4.000 444.000
# Count unique Assay.Targets by ID and filter by unique_assay_targets >= 4
assay_targets_by_id <- common_rsids %>%
dplyr::group_by(ID) %>%
dplyr::summarize(unique_assay_targets = n_distinct(Assay.Target), .groups = "drop") %>%
dplyr::filter(unique_assay_targets >= 4)
# Filter common_rsids by IDs with unique_assay_targets >= 4
filtered_common_rsids <- common_rsids %>%
dplyr::filter(ID %in% assay_targets_by_id$ID)
# Generate pathway enrichment results
genes <- unique(filtered_common_rsids$Assay.Target)
pathway_results <- enrichGO(gene = genes, OrgDb = org.Hs.eg.db, keyType = "SYMBOL")
# Create the dotplot
enrichment_plot_20 <- dotplot(
pathway_results,
showCategory = 20, # Number of categories to display
title = "Pathway Enrichment for Top SNPs Shared by >=4 Proteins in Sun ST9" # Plot title
)
enrichment_plot_20
# Save the plot as a high-resolution TIFF for Nature
ggsave(
filename = "/Users/charleenadams/temp_BI/chemokine_rgs_olink/20_pathway_enrichment_high_res.tiff",
plot = enrichment_plot_20,
width =10, # Width in inches
height = 10, # Height in inches
dpi = 600, # High DPI for publication
units = "in", # Specify units (inches)
device = "tiff", # Save as TIFF
compression = "lzw" # Use LZW compression for smaller file size
)
suppressPackageStartupMessages({
library(stringr)
library(purrr)
library(clusterProfiler)
library(org.Hs.eg.db)
library(tidyr)
library(ggplot2)
library(DT)
library(dplyr)
library(data.table)
})
# Step 1: Extract the total number of proteins/genes for each pathway
pathway_data <- pathway_results@result %>%
as_tibble() %>% # Ensure tibble for compatibility
dplyr::select(Pathway_ID = ID, Description, geneID) %>%
mutate(Genes = strsplit(as.character(geneID), "/")) %>% # Ensure geneID is character
unnest(cols = c(Genes)) # Expand the Genes list column into rows
pathway_totals <- pathway_data %>%
group_by(Pathway_ID, Description) %>%
summarise(
Total_Proteins = n_distinct(Genes), # Count unique genes for each pathway
.groups = "drop"
)
all_proteins_snp <- filtered_common_rsids %>%
mutate(rsID = stringr::str_extract(ID, "rs\\d+")) %>% # Extract rsID using regex
left_join(pathway_data, by = c("Assay.Target" = "Genes"), relationship = "many-to-many") %>%
na.omit()
# Step 2: Join the pathway totals with all SNP data
all_snps_with_proportions <- all_proteins_snp %>%
group_by(ID, rsID, Pathway_ID, Description) %>%
summarise(
GeneCount = n(), # Count how many proteins/genes each SNP is linked to
Proteins = paste(unique(Assay.Target), collapse = ", "), # Combine protein names
.groups = "drop"
) %>%
left_join(pathway_totals, by = c("Pathway_ID", "Description")) %>%
mutate(
Proportion = GeneCount / Total_Proteins # Calculate the proportion for each SNP
)
# Step 3: Inspect the updated data
all_snps_with_proportions_rounded <- all_snps_with_proportions
num_cols <- sapply(all_snps_with_proportions_rounded, is.numeric) # Identify numeric columns
all_snps_with_proportions_rounded[, num_cols] <- lapply(all_snps_with_proportions_rounded[, num_cols], round, digits = 3)
datatable(all_snps_with_proportions,
options = list(pageLength = 10, autoWidth = TRUE, scrollX = TRUE),
rownames = FALSE,
caption = 'Pleiotropy Proportions Per Number of Proteins in Pathway')
# Step 4: Save the results
write.csv(all_snps_with_proportions, "/Users/charleenadams/temp_BI/chemokine_rgs_olink/all_snps_with_proportions.csv", row.names = FALSE)
# Load required libraries
library(dplyr)
library(furrr)
library(data.table)
# # Step 0: Read the data
# all_snps_with_proportions <- fread("/Users/charleenadams/temp_BI/chemokine_rgs_olink/all_snps_with_proportions.csv")
#
# # Step 2: Set up parallel processing with proper random number generation
# plan(multisession, workers = parallel::detectCores() - 1)
# set.seed(123)
# options(future.rng.onMisuse = "ignore")
#
# # Step 3: Perform permutation testing
# n_permutations <- 1000
# permutation_results <- future_map_dfr(
# 1:n_permutations,
# ~{
# shuffled_proportions <- sample(all_snps_with_proportions$Proportion, replace = FALSE)
# tibble(Shuffled_Proportion = shuffled_proportions)
# },
# .progress = TRUE,
# .options = furrr_options(seed = TRUE)
# )
#
# # Step 4: Capture observed proportions
# observed_proportions <- all_snps_with_proportions$Proportion
#
# # Validate that observed proportions are numeric
# if (any(is.na(observed_proportions)) || !is.numeric(observed_proportions)) {
# stop("Observed proportions contain NA or non-numeric values. Check data.")
# }
#
# # Step 5: Compare observed proportions with permutations in parallel
# empirical_p_values <- future_map_dbl(
# observed_proportions,
# ~ mean(permutation_results$Shuffled_Proportion >= .x, na.rm = TRUE),
# .progress = TRUE
# )
#
# # Step 6: Add empirical p-values back to the original data
# all_snps_with_proportions <- all_snps_with_proportions %>%
# mutate(Empirical_P_Value = empirical_p_values)
#
# # Step 7: Save results
# fwrite(
# all_snps_with_proportions,
# "/Users/charleenadams/temp_BI/chemokine_rgs_olink/proportions_with_pvalues.csv"
# )
#
all_snps_with_proportions <- fread(
"/Users/charleenadams/temp_BI/chemokine_rgs_olink/proportions_with_pvalues.csv")
# # Check for significant results
# noms <- all_snps_with_proportions[which(all_snps_with_proportions$Empirical_P_Value < 0.05),]
# noms <- noms[order(noms$Empirical_P_Value),]
chemact=all_snps_with_proportions[which(all_snps_with_proportions$Description=="chemokine activity"),]
#
# # Step 1 & 2: Group by pathway and count significant results
# enrichment_by_pathway <- all_snps_with_proportions %>%
# group_by(Pathway_ID, Description) %>%
# summarise(
# Total_SNPs = n(),
# Significant_SNPs = sum(Empirical_P_Value < 0.05, na.rm = TRUE),
# .groups = 'drop'
# )
#
# # Optionally, you might want to sort by the number of significant SNPs
# enrichment_by_pathway <- enrichment_by_pathway %>%
# arrange(desc(Significant_SNPs))
#
# # Step 3: Perform a test for enrichment
# # Here we'll use a binomial test where we compare the number of significant SNPs against what we expect by chance.
#
# # Expected number of significant SNPs if distributed evenly:
# expected_significant_snps <- nrow(all_snps_with_proportions) * 0.05 # Assuming 5% significance threshold
#
# enrichment_by_pathway <- enrichment_by_pathway %>%
# mutate(
# # Enrichment ratio now compares number of significant SNPs to expected
# Enrichment_Ratio = Significant_SNPs / (Total_SNPs * 0.05), # Expected proportion under null hypothesis
# # Binomial test for each pathway
# P_Value = pbinom(Significant_SNPs, Total_SNPs, 0.05, lower.tail = FALSE)
# )
#
# # Adjust for multiple comparisons using Bonferroni correction
# enrichment_by_pathway <- enrichment_by_pathway %>%
# mutate(Adjusted_P_Value = p.adjust(P_Value, method = "bonferroni"))
#
# sig_enrich <- enrichment_by_pathway[which(enrichment_by_pathway$Adjusted_P_Value<0.05),]
# dim(sig_enrich)
#
# # Save these results:
# write.csv(enrichment_by_pathway, "pathway_enrichment_results.csv", row.names = FALSE)
pathway_enrichment_results <- fread("pathway_enrichment_results.csv")
library(dplyr)
library(ggplot2)
library(scales)
# Step 1 & 2: Group by pathway and count significant results
enrichment_by_pathway <- all_snps_with_proportions %>%
group_by(Pathway_ID, Description) %>%
summarise(
Total_SNPs = n(),
Significant_SNPs = sum(Empirical_P_Value < 0.05, na.rm = TRUE),
.groups = 'drop'
) %>%
mutate(
# Calculate the proportion of significant SNPs
Prop_Significant = Significant_SNPs / Total_SNPs
)
# Optionally, you might want to sort by the number of significant SNPs
enrichment_by_pathway <- enrichment_by_pathway %>%
arrange(desc(Significant_SNPs))
# Step 3: Perform a test for enrichment
enrichment_by_pathway <- enrichment_by_pathway %>%
mutate(
Enrichment_Ratio = Significant_SNPs / (Total_SNPs * 0.05),
P_Value = pbinom(Significant_SNPs, Total_SNPs, 0.05, lower.tail = FALSE)
)
# Adjust for multiple comparisons using Bonferroni correction
enrichment_by_pathway <- enrichment_by_pathway %>%
mutate(Adjusted_P_Value = p.adjust(P_Value, method = "bonferroni"))
#tail(enrichment_by_pathway, n=100)
tots4 <- enrichment_by_pathway[which(enrichment_by_pathway$Adjusted_P_Value<0.05),]
summary(enrichment_by_pathway$Total_SNPs)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 4.00 10.00 19.28 22.00 181.00
chemokine_activity_enrich=enrichment_by_pathway[which(enrichment_by_pathway$Description=="chemokine activity"),]
summary(tots4)
## Pathway_ID Description Total_SNPs Significant_SNPs
## Length:192 Length:192 Min. : 1.00 Min. : 1.00
## Class :character Class :character 1st Qu.: 1.00 1st Qu.: 1.00
## Mode :character Mode :character Median : 2.00 Median : 2.00
## Mean : 2.49 Mean : 2.49
## 3rd Qu.: 3.00 3rd Qu.: 3.00
## Max. :13.00 Max. :13.00
## Prop_Significant Enrichment_Ratio P_Value Adjusted_P_Value
## Min. :1 Min. :20 Min. :0 Min. :0
## 1st Qu.:1 1st Qu.:20 1st Qu.:0 1st Qu.:0
## Median :1 Median :20 Median :0 Median :0
## Mean :1 Mean :20 Mean :0 Mean :0
## 3rd Qu.:1 3rd Qu.:20 3rd Qu.:0 3rd Qu.:0
## Max. :1 Max. :20 Max. :0 Max. :0
tots4
## # A tibble: 192 × 8
## Pathway_ID Description Total_SNPs Significant_SNPs Prop_Significant
## <chr> <chr> <int> <int> <dbl>
## 1 GO:0005165 neurotrophin recepto… 13 13 1
## 2 GO:0015144 carbohydrate transme… 12 12 1
## 3 GO:0015145 monosaccharide trans… 12 12 1
## 4 GO:0015149 hexose transmembrane… 12 12 1
## 5 GO:0051119 sugar transmembrane … 12 12 1
## 6 GO:0055056 D-glucose transmembr… 12 12 1
## 7 GO:0031681 G-protein beta-subun… 7 7 1
## 8 GO:0004602 glutathione peroxida… 6 6 1
## 9 GO:0017069 snRNA binding 6 6 1
## 10 GO:0050811 GABA receptor binding 6 6 1
## # ℹ 182 more rows
## # ℹ 3 more variables: Enrichment_Ratio <dbl>, P_Value <dbl>,
## # Adjusted_P_Value <dbl>
# Enhanced plot
ggplot(tots4, aes(x = reorder(Description, Significant_SNPs), y = Significant_SNPs)) +
geom_bar(stat = "identity", aes(fill = Prop_Significant), color="black") +
scale_fill_gradient(low = "lightblue", high = "darkblue",
name = "Proportion of\nSignificant SNPs",
labels = percent_format()) +
geom_text(aes(label = Significant_SNPs), vjust = -0.3, size = 3) + # Add count labels
coord_flip() +
labs(
title = "Enrichment of Significant SNPs by Pathway",
subtitle = "Number and Proportion of Significant SNPs",
x = "Pathway Description",
y = "Number of Significant SNPs"
) +
scale_y_continuous(expand = expansion(mult = c(0.05, 0.15))) + # Adjust space for labels
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5),
axis.text.y = element_text(size = 7), # Adjust text size if needed
legend.position = "right",
panel.grid.major.y = element_blank() # Remove horizontal grid lines for clarity
)