1 Introduction

This analysis investigates the potential genetic overlap between N-acetyltaurine (NAT) and Body Mass Index (BMI) using summary statistics, SuSiE fine-mapping, and colocalization analysis. We focus on a 500KB region upstream and downstream of the sentinel SNP rs147852470 and include only variants with a MAF >0.05. (We will create a separate document to examine MAF >0.01.)

2 Data Preparation

Initially, I obtained the summary statistics for the METSIM NAT data and Jurgen’s (UKB) BMI data and noted they were on different builds. After using LiftOver to get the BMI positions for GRCh38, I loaded the data into RStudio to look at it and harmonized it within TwoSampleMR via harmonize_data(). That code is below.

NB: I subsequently learned that harmonize_data() worked as expected except for multi-allelic variants, which I dealt with later by excluding them and palindromes prior to running SuSiE.

2.1 TwoSampleMR for harmonize_data()

# ---------------------------------------------
# Load Necessary Packages
# ---------------------------------------------
library(coloc)        # For coloc.abf function
library(data.table)   # For fread function
library(dplyr)        # For data manipulation functions like mutate, filter
library(ggplot2)      # For plotting
library(readr)        # For read_table2 function
library(TwoSampleMR)  # For harmonise_data function
library(readxl)       # For reading Excel files
library(biomaRt)      # For merging in TSS info using Ensembl

# ---------------------------------------------
# Load and Format NAT Data
# ---------------------------------------------

# NB: The BP_GRCh37 is actually BP_GRCh38. Testing liftOver confirmed the data is already on GRCh38.
# The sample size is 6099 for NAT from the PheWeb data.
nat_mr <- fread("/Users/charleenadams/temp_BI/phenocode-C100005466_hg38_merged_nat_twosample.tsv")
nat_mr <- as.data.frame(nat_mr)

# Filter out rows with missing or invalid RSIDs
filtered_nat_df <- nat_mr %>% 
  filter(!is.na(rsids) & rsids != "" & grepl("^rs", rsids)) %>%
  arrange(rsids)

# Format harmonized data for exposure
exp_dat <- filtered_nat_df %>%
  mutate(chr.exposure = chrom,
         pos.exposure = BP_GRCh37,   # This is actually BP_GRCh38
         beta.exposure = beta,
         se.exposure = se,
         exposure = "N-acetyltaurine",
         id.exposure = "N-acetyltaurine",
         pval.exposure = pval,
         SNP.exposure = SNP,
         SNP = SNP,
         effect_allele.exposure = effect_allele,
         other_allele.exposure = other_allele,
         eaf.exposure = eaf,
         samplesize.exposure = 6099,
         id_col = nearest_genes)

# ---------------------------------------------
# Load and Format BMI Data
# ---------------------------------------------

# NB: The sample size is 460000 from the Jurgens 2022 README
# Filter out rows with missing or invalid SNPs
filtered_bmi_df <- bmi_mr %>%
  filter(!is.na(SNP) & SNP != "" & grepl("^rs", SNP)) %>%
  arrange(SNP)

# Format harmonized data for outcome
out_dat <- filtered_bmi_df %>%
  mutate(SNP = SNP,
         SNP.outcome = SNP,
         chr.outcome = CHR_x,
         pos.outcome = BP_GRCh38,
         beta.outcome = beta,
         se.outcome = se,
         pval.outcome = pval,
         effect_allele.outcome = effect_allele,
         other_allele.outcome = other_allele,
         eaf.outcome = eaf,
         samplesize.outcome = 460000,
         id.outcome = "BMI",
         outcome = "BMI")

# ---------------------------------------------
# Harmonize Data (Exposure and Outcome)
# ---------------------------------------------

dat <- harmonise_data(
  exposure_dat = exp_dat,
  outcome_dat = out_dat
)

# Save harmonized data
write.csv(dat, file = "/Users/charleenadams/temp_BI/harmonized_nat_BMI_dat.csv", row.names = FALSE)
# Reload harmonized data to ensure consistency
dat <- fread("/Users/charleenadams/temp_BI/harmonized_nat_BMI_dat.csv")

# ---------------------------------------------
# Remove Duplicate SNP Entries: We may LEGACY this but keeping for now
# ---------------------------------------------

# NB: I had initially kept only the first entry where an rsid was duplicated, which
# did not address what happens within the harmonize_data() process for multi-allelic variants....

# Remove duplicates based on SNP
# dups <- dat[duplicated(dat$SNP),]
# dups <- dups[order(dups$SNP),]

# dat_unique <- dat[!duplicated(dat$SNP), ]
# dim(dat_unique)

# ---------------------------------------------

# ---------------------------------------------
# Look at multi-allelic variants and harmonize
# ---------------------------------------------

harmonized_data <- dat %>%
  filter(
    effect_allele.exposure == effect_allele.outcome & 
    other_allele.exposure == other_allele.outcome
  )

# Check the first few rows of the filtered data to ensure it looks correct
dim(harmonized_data)
## [1] 11588423       57
harmonized_data <- harmonized_data[order(harmonized_data$SNP),]

dup_snps <- harmonized_data %>%
  group_by(SNP) %>%
  filter(n() > 1)

# View the duplicates if any are present
# head(dup_snps)

# Get a count of how many SNPs have duplicates
dup_count <- n_distinct(dup_snps$SNP)
print(paste("Number of SNPs with duplicates:", dup_count))
## [1] "Number of SNPs with duplicates: 23568"
# Define a function to check if a SNP is palindromic
is_palindromic <- function(effect_allele, other_allele) {
  # Define palindromic pairs
  palindromic_pairs <- list(c("A", "T"), c("T", "A"), c("C", "G"), c("G", "C"))
  
  # Check if the provided alleles form a palindromic pair
  return(any(mapply(function(x, y) all(c(effect_allele, other_allele) %in% c(x, y)), 
                    sapply(palindromic_pairs, `[[`, 1), sapply(palindromic_pairs, `[[`, 2))))
}

# Apply the function to check for palindromic SNPs in your data
dup_snps <- dup_snps %>%
  mutate(is_palindromic = mapply(is_palindromic, effect_allele.exposure, other_allele.exposure))

# Filter and view only palindromic SNPs
palindromic_snps <- dup_snps %>% filter(is_palindromic)

# View the results
# print(palindromic_snps)
dim(palindromic_snps)
## [1] 12557    58
dim(dup_snps)
## [1] 53414    58
# Identify SNPs with multiple combinations of alleles
multi_allelic_snps <- dup_snps %>%
  group_by(SNP) %>%
  summarize(
    unique_allele_combinations = n_distinct(paste(effect_allele.exposure, other_allele.exposure)),
    .groups = 'drop'
  ) %>%
  filter(unique_allele_combinations > 1)

# View the results
# print(multi_allelic_snps)

# Count the number of SNPs that are multi-allelic
multi_allelic_count <- nrow(multi_allelic_snps)
print(paste("Number of multi-allelic SNPs:", multi_allelic_count))
## [1] "Number of multi-allelic SNPs: 1008"
multi_allelic_df=dup_snps[which(dup_snps$SNP%in%multi_allelic_snps$SNP),]
# head(multi_allelic_df)
multi_allelic_df <- as.data.frame(multi_allelic_df)
# head(multi_allelic_df$pos.exposure)
dim(multi_allelic_df)
## [1] 8113   58
dup_snps=as.data.frame(dup_snps)
# head(dup_snps)
dup_snps$SNP_index=NULL

# Identify rows that are exact duplicates based on all columns except 'SNP'
exact_duplicates <- dup_snps %>%
  group_by(SNP) %>%
  filter(duplicated(across(everything())) | duplicated(across(everything()), fromLast = TRUE))

# Count the number of exact duplicates
exact_dup_count <- nrow(exact_duplicates)
print(paste("Number of exact duplicate rows:", exact_dup_count))
## [1] "Number of exact duplicate rows: 53408"
# View a few exact duplicates
# print(head(exact_duplicates))

# ---------------------------------------------
# Return to the dat file as processed originally without doing anything yet
# about multi-allelic variants or palindromes
# ---------------------------------------------

# Check effect allele frequencies (EAF) differences
dat$eaf_diff <- abs(dat$eaf.exposure - dat$eaf.outcome)
summary(dat$eaf_diff)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.004719 0.016603 0.077164 0.050143 0.998914
# ---------------------------------------------
# Prepare Data for Coloc Analysis
# For the rest of this, stick with the original removal of dups,
# keeping the first entry. We address this later. 
# ---------------------------------------------

# Select relevant columns
columns_to_keep <- c(
  "SNP", "chr.exposure", "chr.outcome", "pos.exposure", "pos.outcome", "effect_allele.exposure", 
  "other_allele.exposure", "effect_allele.outcome", "other_allele.outcome", "beta.exposure", 
  "beta.outcome", "eaf.exposure", "eaf.outcome", "se.exposure", "se.outcome", "exposure", 
  "id.exposure", "outcome", "id.outcome", "pval.exposure", "pval.outcome", "samplesize.exposure", 
  "samplesize.outcome", "id_col", "mr_keep"
)

# Create the coloc-ready dataset
dat_unique <- dat[!duplicated(dat$SNP), ]
dat_unique <- setDT(dat_unique)
str(dat_unique)
## Classes 'data.table' and 'data.frame':   11571724 obs. of  58 variables:
##  $ SNP                   : chr  "rs10" "rs1000000" "rs10000000" "rs10000003" ...
##  $ effect_allele.exposure: chr  "C" "A" "T" "G" ...
##  $ other_allele.exposure : chr  "A" "G" "A" "A" ...
##  $ effect_allele.outcome : chr  "C" "A" "T" "G" ...
##  $ other_allele.outcome  : chr  "A" "G" "A" "A" ...
##  $ beta.exposure         : num  -0.035 -0.025 -0.018 -0.026 -0.0042 0.061 -0.03 -0.018 0.017 -0.032 ...
##  $ beta.outcome          : num  -0.000166 0.000529 -0.009911 0.001551 -0.002036 ...
##  $ eaf.exposure          : num  0.075 0.23 0.073 0.33 0.49 0.0048 0.0093 0.011 0.43 0.049 ...
##  $ eaf.outcome           : num  0.9436 0.2237 0.0601 0.7057 0.5422 ...
##  $ remove                : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ palindromic           : logi  FALSE FALSE TRUE FALSE FALSE FALSE ...
##  $ ambiguous             : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ id.outcome            : chr  "BMI" "BMI" "BMI" "BMI" ...
##  $ CHR_x                 : int  7 12 4 4 4 4 4 4 4 4 ...
##  $ BP                    : int  92383888 126890980 40088896 57561647 85161558 108826383 114553253 172776204 21618674 138223055 ...
##  $ effect_allele.x       : chr  "A" "G" "A" "A" ...
##  $ other_allele.x        : chr  "C" "A" "T" "G" ...
##  $ eaf.x                 : num  0.0564 0.7763 0.9399 0.2943 0.4578 ...
##  $ INFO                  : num  0.939 0.999 0.982 0.998 0.994 ...
##  $ pval.x                : num  0.9685 0.8147 0.0144 0.4536 0.284 ...
##  $ beta.x                : num  0.000166 -0.000529 0.009911 -0.001551 0.002036 ...
##  $ se.x                  : num  0.00421 0.00226 0.00405 0.00207 0.0019 ...
##  $ CHR_y                 : chr  "chr7" "chr12" "chr4" "chr4" ...
##  $ BP_GRCh38.x           : num  9.28e+07 1.26e+08 4.01e+07 5.67e+07 8.42e+07 ...
##  $ SNP.outcome           : chr  "rs10" "rs1000000" "rs10000000" "rs10000003" ...
##  $ chr.outcome           : int  7 12 4 4 4 4 4 4 4 4 ...
##  $ pos.outcome           : num  9.28e+07 1.26e+08 4.01e+07 5.67e+07 8.42e+07 ...
##  $ se.outcome            : num  0.00421 0.00226 0.00405 0.00207 0.0019 ...
##  $ pval.outcome          : num  0.9685 0.8147 0.0144 0.4536 0.284 ...
##  $ samplesize.outcome    : int  460000 460000 460000 460000 460000 460000 460000 460000 460000 460000 ...
##  $ outcome               : chr  "BMI" "BMI" "BMI" "BMI" ...
##  $ chrom                 : int  7 12 4 4 4 4 4 4 4 4 ...
##  $ BP_GRCh37             : int  92754574 126406434 40087276 56695481 84240405 107905227 113632097 171855053 21617051 137301901 ...
##  $ other_allele.y        : chr  "A" "G" "A" "A" ...
##  $ effect_allele.y       : chr  "C" "A" "T" "G" ...
##  $ rsids                 : chr  "rs10" "rs1000000" "rs10000000" "rs10000003" ...
##  $ nearest_genes         : chr  "CDK6" "TMEM132B" "N4BP2" "HOPX" ...
##  $ pval.y                : num  0.31 0.25 0.62 0.18 0.82 0.65 0.76 0.84 0.36 0.45 ...
##  $ beta.y                : num  -0.035 -0.025 -0.018 -0.026 -0.0042 0.061 -0.03 -0.018 0.017 -0.032 ...
##  $ se.y                  : num  0.035 0.022 0.035 0.019 0.018 0.13 0.095 0.087 0.018 0.042 ...
##  $ eaf.y                 : num  0.075 0.23 0.073 0.33 0.49 0.0048 0.0093 0.011 0.43 0.049 ...
##  $ ac                    : num  11282 2811 896 8214 5917 ...
##  $ r2                    : num  1.7e-04 2.2e-04 4.1e-05 3.0e-04 8.9e-06 3.4e-05 1.6e-05 6.6e-06 1.4e-04 9.5e-05 ...
##  $ CHR                   : chr  "chr7" "chr12" "chr4" "chr4" ...
##  $ BP_GRCh38.y           : num  9.31e+07 1.26e+08 4.01e+07 5.58e+07 8.33e+07 ...
##  $ chr.exposure          : int  7 12 4 4 4 4 4 4 4 4 ...
##  $ pos.exposure          : int  92754574 126406434 40087276 56695481 84240405 107905227 113632097 171855053 21617051 137301901 ...
##  $ se.exposure           : num  0.035 0.022 0.035 0.019 0.018 0.13 0.095 0.087 0.018 0.042 ...
##  $ exposure              : chr  "N-acetyltaurine" "N-acetyltaurine" "N-acetyltaurine" "N-acetyltaurine" ...
##  $ id.exposure           : chr  "N-acetyltaurine" "N-acetyltaurine" "N-acetyltaurine" "N-acetyltaurine" ...
##  $ pval.exposure         : num  0.31 0.25 0.62 0.18 0.82 0.65 0.76 0.84 0.36 0.45 ...
##  $ SNP.exposure          : chr  "rs10" "rs1000000" "rs10000000" "rs10000003" ...
##  $ samplesize.exposure   : int  6099 6099 6099 6099 6099 6099 6099 6099 6099 6099 ...
##  $ id_col                : chr  "CDK6" "TMEM132B" "N4BP2" "HOPX" ...
##  $ action                : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ SNP_index             : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ mr_keep               : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ eaf_diff              : num  0.86861 0.00631 0.01292 0.37568 0.05225 ...
##  - attr(*, ".internal.selfref")=<externalptr>
dat_coloc_ready <- dat_unique[, ..columns_to_keep]
dat_coloc_ready$Gene <- dat_coloc_ready$id_col

# ---------------------------------------------
# Retrieve TSS Information Using biomaRt
# ---------------------------------------------

# Connect to Ensembl to retrieve gene information
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
print("Connected to Ensembl")
## [1] "Connected to Ensembl"
# Retrieve TSS information
gene_list <- unique(dat_coloc_ready$id_col)
tss_data <- getBM(
  attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name", "transcription_start_site"),
  filters = "hgnc_symbol", 
  values = gene_list, 
  mart = ensembl)

# Rename columns for better clarity
colnames(tss_data) <- c("Ensembl_ID", "Gene", "Chromosome", "TSS")

# ---------------------------------------------
# Merge TSS Data with Coloc-Ready Data
# ---------------------------------------------

# Convert to data.table for merging
dat_dt <- as.data.table(dat_coloc_ready)
tss_data_dt <- as.data.table(tss_data)

# Ensure the 'Gene' column is of the same type
dat_dt[, Gene := as.character(Gene)]
tss_data_dt[, Gene := as.character(Gene)]

# Remove duplicated rows in tss_data and keep only the first TSS for each gene
tss_data_unique <- tss_data_dt[!duplicated(tss_data_dt$Gene), ]

# Merge data
merged_data <- merge(dat_dt, tss_data_unique, by = "Gene", all.x = TRUE)

# Save merged data
# fwrite(merged_data, "/Users/charleenadams/temp_BI/final_merged_data.csv", sep = ",")

# ---------------------------------------------
# Example: Filtering Data for a Specific Gene (PTER)
# ---------------------------------------------
pter <- merged_data[which(merged_data$Gene == "PTER"), ]
#head(pter)

2.2 Load the Merged Summary Statistics

We begin by loading the harmonized NAT/BMI data and identifying the sentinel SNP for the PTER region.

# Uses the original final_merged_data.csv with first entry of duplicates retained.
# Doesn't address failures to harmonize multi-allelic variants.
# We deal with this later.

# Load the final merged data
final_merged <- fread("/Users/charleenadams/temp_BI/final_merged_data.csv")
final_merged <- setDT(final_merged)

2.3 Identify Sentinel SNP for PTER and Define Region

We identify the sentinel SNP and define a 500KB window around it to extract relevant SNPs.

# Identify the SNP of interest
snp_of_interest <- final_merged %>% filter(SNP == "rs147852470")

# Extract position and chromosome
snp_position <- snp_of_interest$pos.exposure[1]
chromosome <- snp_of_interest$Chromosome[1]

# Define 500KB upstream and downstream boundaries
upstream_limit <- snp_position - 500000
downstream_limit <- snp_position + 500000

# Subset data within the 500KB window
subset_data <- final_merged %>%
  filter(Chromosome == chromosome &
         pos.exposure >= upstream_limit &
         pos.exposure <= downstream_limit)

# Save the subset data
write.table(subset_data, "500kb_up_and_downstream_sentinel_NAT.txt",
            row.names = FALSE, col.names = TRUE, quote = FALSE, sep = '\t')
write.csv(subset_data, "500kb_up_and_downstream_sentinel_NAT.csv")

2.4 Examine Gene Distribution

# Check the number of SNPs per gene
table(subset_data$Gene)
## 
##  C1QL3   CUBN MINDY3   PTER   RSU1 
##    193    358   1150   1738   1277

2.5 Load Summary Statistics

# Load summary statistics
summary_stats_file <- "500kb_up_and_downstream_sentinel_NAT.csv"
summary_stats <- fread(summary_stats_file, header = TRUE, stringsAsFactors = FALSE)

# Display dimensions
cat("Summary statistics loaded with dimensions:", dim(summary_stats), "\n")
## Summary statistics loaded with dimensions: 4716 30

2.6 Calculate MAF and Filter SNPs

We calculate the Minor Allele Frequency (MAF) and filter out SNPs with MAF ≤ 0.05.

# Calculate MAF
summary_stats <- summary_stats %>%
  mutate(
    maf.exposure = pmin(eaf.exposure, 1 - eaf.exposure, na.rm = TRUE),
    maf.outcome = pmin(eaf.outcome, 1 - eaf.outcome, na.rm = TRUE)
  ) %>%
  filter(maf.exposure > 0.05)

# Display dimensions after filtering
cat("After MAF filtering, dimensions:", dim(summary_stats), "\n")
## After MAF filtering, dimensions: 2245 32

2.7 Download and Annotate 1000 Genomes GRCh38 VCF Files

  • from UCSC, use wget for each chromosome. For example, for chr1:

wget https://hgdownload.soe.ucsc.edu/gbdb/hg38/1000Genomes/ALL.chr1.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
  • downloaded copy of dbSNP (00-All.vcf.gz) for GRCh38
  • annotate within a loop using bcftools

#!/bin/bash

# Define the directories and files
VCF_DIR="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch"
DBSNP_FILE="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/00-All.vcf.gz"
OUTPUT_DIR="$VCF_DIR/annotated"
mkdir -p "$OUTPUT_DIR"

# Get the list of chromosome VCF files only
VCF_FILES=("$VCF_DIR"/ALL.chr*.vcf.gz)

# Function to process a single VCF file
process_vcf() {
    local vcf_file="$1"
    echo "Processing ${vcf_file}..."

    # Create index for the VCF file
    echo "Creating index for ${vcf_file}..."
    tabix -p vcf "$vcf_file"

    # Annotate the VCF file using dbSNP
    echo "Annotating ${vcf_file}..."
    bcftools annotate -a "$DBSNP_FILE" -c ID "$vcf_file" -O z -o "$OUTPUT_DIR/$(basename "${vcf_file%.vcf.gz}_annotated.vcf.gz")"

    echo "Annotated VCF saved as $OUTPUT_DIR/$(basename "${vcf_file%.vcf.gz}_annotated.vcf.gz")"
}

# Export the function and variables for parallel processing
export -f process_vcf
export DBSNP_FILE OUTPUT_DIR

# Use GNU Parallel to process VCF files
echo "Starting annotation in parallel..."
parallel process_vcf ::: "${VCF_FILES[@]}"

echo "All annotations completed."

# Run in background with: 
nohup ./annotate_rsids.sh > annotate_rsids_log.txt 2>&1 &

2.8 Select the Finnish samples

The NAT summary statistics used Finnish samples.

I retrieved a 1000 Genomes annotation file with population and subpopulation linkers to select out populations: integrated_call_samples_v3.20130502.ALL.panel


#!/bin/bash

# Set the paths
ANNOTATED_DIR="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated"
OUTPUT_DIR="$ANNOTATED_DIR/fin"
PANEL_FILE="$ANNOTATED_DIR/integrated_call_samples_v3.20130502.ALL.panel"

# Create the output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"

# Get the list of Finnish samples
FIN_SAMPLES=$(awk '$2 == "FIN" {print $1}' "$PANEL_FILE")

# Filter VCF files in parallel
for vcf_file in "$ANNOTATED_DIR"/*.vcf.gz; do
    base_name=$(basename "$vcf_file" .vcf.gz)
    output_file="$OUTPUT_DIR/${base_name}_FIN.vcf.gz"

    # Run the filtering command
    bcftools view -S <(echo "$FIN_SAMPLES") -O z -o "$output_file" "$vcf_file" &
done

# Wait for all background processes to finish
wait

echo "Filtering completed. Finnish samples are saved in $OUTPUT_DIR."

# Run in background with: 
nohup ./filter_fin_population.sh > filter_fin_population_log.txt 2>&1 &

2.9 Make chr10 bfiles with MAF >0.05 for Finnish population

NB: Without MAF filtering, the subsequent LD matrix will be filled with NaNs.

  • must make tabix file: tabix -p vcf ALL.chr10.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased_annotated_FIN.vcf.gz

#!/bin/bash

# ----------------------------------------------------------------------------------
# Script: create_bfiles_from_vcf.sh
# Description: Convert a VCF.gz file to PLINK binary files with specified filtering options.
# Usage: ./create_bfiles_from_vcf.sh
# ----------------------------------------------------------------------------------

# ------------------------------
# Step 1: Define File Paths
# ------------------------------

# Input VCF.gz file path
INPUT_VCF="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/fin/ALL.chr10.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased_annotated_FIN.vcf.gz"

# Output PLINK binary files prefix
OUTPUT_PREFIX="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/fin/bfiles_filtered/ALL.chr10.maf_filtered_fin2"

# ------------------------------
# Step 2: Check for Required Tools
# ------------------------------

# Function to check if a command exists
command_exists () {
    command -v "$1" >/dev/null 2>&1 ;
}

# Check for plink2
if ! command_exists plink2 ; then
    echo "Error: plink2 is not installed or not in your PATH."
    echo "Please install plink2 before running this script."
    exit 1
fi

# Check for tabix
if ! command_exists tabix ; then
    echo "Error: tabix is not installed or not in your PATH."
    echo "Please install tabix before running this script."
    exit 1
fi

# ------------------------------
# Step 3: Verify Input VCF File
# ------------------------------

if [ ! -f "$INPUT_VCF" ]; then
    echo "Error: Input VCF file not found at $INPUT_VCF"
    exit 1
fi

# Check if VCF is compressed with gzip
if [[ "$INPUT_VCF" != *.gz ]]; then
    echo "Error: Input VCF file must be gzip-compressed (.vcf.gz)"
    exit 1
fi

# ------------------------------
# Step 4: Index the VCF File (If Not Already Indexed)
# ------------------------------

INDEX_FILE="${INPUT_VCF}.tbi"

if [ ! -f "$INDEX_FILE" ]; then
    echo "VCF index (.tbi) not found. Creating index with tabix..."
    tabix -p vcf "$INPUT_VCF"
    if [ $? -ne 0 ]; then
        echo "Error: Failed to index VCF file."
        exit 1
    fi
    echo "VCF file indexed successfully."
else
    echo "VCF index (.tbi) already exists."
fi

# ------------------------------
# Step 5: Run plink2 Command
# ------------------------------

echo "Starting plink2 conversion from VCF to PLINK binary files..."

plink2 \
  --vcf "$INPUT_VCF" \
  --keep-allele-order \
  --maf 0.05 \
  --make-bed \
  --max-alleles 2 \
  --snps-only just-acgt \
  --out "$OUTPUT_PREFIX" \
  --threads 4 \
  --memory 16000

# ------------------------------
# Step 6: Verify Output
# ------------------------------

if [ -f "${OUTPUT_PREFIX}.bed" ] && [ -f "${OUTPUT_PREFIX}.bim" ] && [ -f "${OUTPUT_PREFIX}.fam" ]; then
    echo "plink2 conversion completed successfully."
    echo "Output files:"
    echo "${OUTPUT_PREFIX}.bed"
    echo "${OUTPUT_PREFIX}.bim"
    echo "${OUTPUT_PREFIX}.fam"
else
    echo "Error: plink2 failed to create all output files."
    echo "Please check the error messages above for more details."
    exit 1
fi

echo "Done."

# Run with: 

nohup ./create_bfiles_from_vcf_fin.sh > create_bfiles_from_vcf_fin_log.txt 2>&1 &

2.10 Loop Template for Finnish bfiles (not run; not needed right now)


#!/bin/bash

# Set the directory containing the FIN VCF files
vcf_dir="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/fin"
output_dir="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/bfiles_filtered"

# Create output directory if it doesn't exist
mkdir -p "$output_dir"

# Loop through each VCF file in the directory
for vcf_file in "$vcf_dir"/*.vcf.gz; do
    # Get the base name of the VCF file (without the extension)
    base_name=$(basename "$vcf_file" .vcf.gz)
    
    # Create bfiles using plink2 with filtering
    plink2 --vcf "$vcf_file" \
           --maf 0.05 \
           --make-bed \
           --out "$output_dir/$base_name" \
           --allow-extra-chr \
           --keep-allele-order \
           --snps-only just-acgt \
           --max-alleles 2

    echo "Converted and filtered $vcf_file to bfiles."
done

# Run in background with: 
nohup ./create_filtered_bfiles.sh > create_filtered_bfiles_log.txt 2>&1 &

2.11 Create Finnish LD Matrix for PTER Region Around Sentinel SNP

Selected 1MB up and downstream of the PTER sentinel SNP, as we use R later to narrow to the 500KB region.


#!/bin/bash

# Define input files and parameters
PLINK_PATH="/usr/local/bin/plink2"
DATA_DIR="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/fin/bfiles_filtered"
OUTPUT_DIR="$DATA_DIR/chr10_ld_matrix"
CHR=10
START=15402141
END=17402141

# Ensure the output directory exists
mkdir -p "$OUTPUT_DIR"

# Input bfile for chromosome 10 after MAF filtering
INPUT_BFILE="$DATA_DIR/ALL.chr${CHR}.maf_filtered_fin2"

# Generate the LD matrix using PLINK 2 with the MAF-filtered data
echo "Generating LD matrix for chromosome $CHR using MAF-filtered data..."
$PLINK_PATH --bfile "$INPUT_BFILE" \
    --chr "$CHR" \
    --from-bp "$START" \
    --to-bp "$END" \
    --r-phased square \
    --out "$OUTPUT_DIR/LD_matrix_chr${CHR}_${START}_${END}_maf_filtered_fin2"

# Check if the command succeeded
if [ $? -ne 0 ]; then
    echo "Error during LD matrix calculation. Check the log file for details."
    exit 1
fi

echo "LD matrix generation for chromosome $CHR completed successfully."

# Run in background with: 
nohup ./make_fin_filtered_matrix_maf.sh > make_fin_filtered_matrix_maf_log.txt 2>&1 &

# Dataframe version
nohup ./make_fin_filtered_matrix_maf_df.sh > make_fin_filtered_matrix_maf_df_log.txt 2>&1 &

2.12 Select the EUR Samples

The Jurgens BMI summary statistics uses UK Biobank samples, which have a different LD pattern than the Finnish population.


#!/bin/bash

# Set the paths
ANNOTATED_DIR="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated"
OUTPUT_DIR="$ANNOTATED_DIR/eur"
PANEL_FILE="$ANNOTATED_DIR/integrated_call_samples_v3.20130502.ALL.panel"

# Create the output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"

# Get the list of European samples and write to a file
EUR_SAMPLE_FILE="$OUTPUT_DIR/eur_samples.txt"
awk '$3 == "EUR" {print $1}' "$PANEL_FILE" > "$EUR_SAMPLE_FILE"

# Filter VCF files in parallel
for vcf_file in "$ANNOTATED_DIR"/*.vcf.gz; do
    base_name=$(basename "$vcf_file" .vcf.gz)
    output_file="$OUTPUT_DIR/${base_name}_EUR.vcf.gz"

    # Run the filtering command
    bcftools view -S "$EUR_SAMPLE_FILE" -O z -o "$output_file" "$vcf_file" &
done

# Wait for all background processes to finish
wait

echo "Filtering completed. European samples are saved in $OUTPUT_DIR."

2.13 Make chr10 bfiles with MAF >0.05 for EUR Population

NB: Without MAF filtering, the subsequent LD matrix will be filled with NaNs.

  • must make tabix file: tabix -p vcf ALL.chr10.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased_annotated_EUR.vcf.gz

#!/bin/bash

# ----------------------------------------------------------------------------------
# Script: create_bfiles_from_vcf.sh
# Description: Convert a VCF.gz file to PLINK binary files with specified filtering options.
# Usage: ./create_bfiles_from_vcf.sh
# ----------------------------------------------------------------------------------

# ------------------------------
# Step 1: Define File Paths
# ------------------------------

# Input VCF.gz file path
INPUT_VCF="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/eur/ALL.chr10.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased_annotated_EUR.vcf.gz"

# Output PLINK binary files prefix
OUTPUT_PREFIX="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/eur/eur_bfiles_filtered/ALL.chr10.maf_filtered_EUR"

# ------------------------------
# Step 2: Check for Required Tools
# ------------------------------

# Function to check if a command exists
command_exists () {
    command -v "$1" >/dev/null 2>&1 ;
}

# Check for plink2
if ! command_exists plink2 ; then
    echo "Error: plink2 is not installed or not in your PATH."
    echo "Please install plink2 before running this script."
    exit 1
fi

# Check for tabix
if ! command_exists tabix ; then
    echo "Error: tabix is not installed or not in your PATH."
    echo "Please install tabix before running this script."
    exit 1
fi

# ------------------------------
# Step 3: Verify Input VCF File
# ------------------------------

if [ ! -f "$INPUT_VCF" ]; then
    echo "Error: Input VCF file not found at $INPUT_VCF"
    exit 1
fi

# Check if VCF is compressed with gzip
if [[ "$INPUT_VCF" != *.gz ]]; then
    echo "Error: Input VCF file must be gzip-compressed (.vcf.gz)"
    exit 1
fi

# ------------------------------
# Step 4: Index the VCF File (If Not Already Indexed)
# ------------------------------

INDEX_FILE="${INPUT_VCF}.tbi"

if [ ! -f "$INDEX_FILE" ]; then
    echo "VCF index (.tbi) not found. Creating index with tabix..."
    tabix -p vcf "$INPUT_VCF"
    if [ $? -ne 0 ]; then
        echo "Error: Failed to index VCF file."
        exit 1
    fi
    echo "VCF file indexed successfully."
else
    echo "VCF index (.tbi) already exists."
fi

# ------------------------------
# Step 5: Run plink2 Command
# ------------------------------

echo "Starting plink2 conversion from VCF to PLINK binary files..."

plink2 \
  --vcf "$INPUT_VCF" \
  --keep-allele-order \
  --maf 0.05 \
  --make-bed \
  --max-alleles 2 \
  --snps-only just-acgt \
  --out "$OUTPUT_PREFIX" \
  --threads 4 \
  --memory 16000

# ------------------------------
# Step 6: Verify Output
# ------------------------------

if [ -f "${OUTPUT_PREFIX}.bed" ] && [ -f "${OUTPUT_PREFIX}.bim" ] && [ -f "${OUTPUT_PREFIX}.fam" ]; then
    echo "plink2 conversion completed successfully."
    echo "Output files:"
    echo "${OUTPUT_PREFIX}.bed"
    echo "${OUTPUT_PREFIX}.bim"
    echo "${OUTPUT_PREFIX}.fam"
else
    echo "Error: plink2 failed to create all output files."
    echo "Please check the error messages above for more details."
    exit 1
fi

echo "Done."

2.14 Create EUR LD Matrix for PTER Region Around Sentinel SNP


#!/bin/bash

# Define input files and parameters
PLINK_PATH="/usr/local/bin/plink2"
DATA_DIR="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/eur/eur_bfiles_filtered"
OUTPUT_DIR="$DATA_DIR/chr10_ld_matrix"
CHR=10
START=15402141
END=17402141

# Ensure the output directory exists
mkdir -p "$OUTPUT_DIR"

# Input bfile for chromosome 10 after MAF filtering
INPUT_BFILE="$DATA_DIR/ALL.chr${CHR}.maf_filtered_EUR"

# Generate the LD matrix using PLINK 2 with the MAF-filtered data
echo "Generating LD matrix for chromosome $CHR using MAF-filtered data..."
$PLINK_PATH --bfile "$INPUT_BFILE" \
    --chr "$CHR" \
    --from-bp "$START" \
    --to-bp "$END" \
    --r-phased square \
    --out "$OUTPUT_DIR/LD_matrix_chr${CHR}_${START}_${END}_maf_filtered"

# Check if the command succeeded
if [ $? -ne 0 ]; then
    echo "Error during LD matrix calculation. Check the log file for details."
    exit 1
fi

echo "LD matrix generation for chromosome $CHR completed successfully."

# Run with: 
nohup ./make_eur_filtered_matrix_maf.sh > make_eur_filtered_matrix_maf_log.txt 2>&1 &

# Dataframe version: 
nohup ./make_eur_filtered_df_maf.sh > make_eur_filtered_df_maf_log.txt 2>&1 &

2.15 Load LD Matrices

We load the LD matrices and SNP lists for both traits.

NB: Loaded below are the square LD matrices made with the --r-phased square flag. I also produced are the matrcies made with --r-phased flag. Those matrices contain fewer SNPs, which Grok suggests is due to the square matrices maybe including some “monomorphic” SNPs? I load the dataframe matrices later in the script, as they also contain explicit info for which allele plink used as the “major” (MAJ_A) allele.

# LD matrix and SNP list for Trait 1 (Exposure)
ld_matrix_file1 <- "LD_matrices/1000Genomes_scratch/annotated/fin/bfiles_filtered/chr10_ld_matrix/LD_matrix_chr10_15402141_17402141_maf_filtered_fin2.phased.vcor1"
snp_list_file1 <- "LD_matrices/1000Genomes_scratch/annotated/fin/bfiles_filtered/chr10_ld_matrix/LD_matrix_chr10_15402141_17402141_maf_filtered_fin2.phased.vcor1.vars"

# Load LD matrix and SNP list for Trait 1
ld_matrix1 <- as.matrix(read.table(ld_matrix_file1, header = FALSE, stringsAsFactors = FALSE))
snp_list1 <- read.table(snp_list_file1, header = FALSE, stringsAsFactors = FALSE)$V1
rownames(ld_matrix1) <- colnames(ld_matrix1) <- snp_list1

# Ensure LD matrix is symmetric
if (!isSymmetric(ld_matrix1)) {
  ld_matrix1 <- (ld_matrix1 + t(ld_matrix1)) / 2
}

# LD matrix and SNP list for Trait 2 (Outcome)
ld_matrix_file2 <- "LD_matrices/1000Genomes_scratch/annotated/eur/eur_bfiles_filtered/chr10_ld_matrix/LD_matrix_chr10_15402141_17402141_maf_filtered_EUR2.phased.vcor1"
snp_list_file2 <- "LD_matrices/1000Genomes_scratch/annotated/eur/eur_bfiles_filtered/chr10_ld_matrix/LD_matrix_chr10_15402141_17402141_maf_filtered_EUR2.phased.vcor1.vars"

# Load LD matrix and SNP list for Trait 2
ld_matrix2 <- as.matrix(read.table(ld_matrix_file2, header = FALSE, stringsAsFactors = FALSE))
snp_list2 <- read.table(snp_list_file2, header = FALSE, stringsAsFactors = FALSE)$V1
rownames(ld_matrix2) <- colnames(ld_matrix2) <- snp_list2

# Ensure LD matrix is symmetric
if (!isSymmetric(ld_matrix2)) {
  ld_matrix2 <- (ld_matrix2 + t(ld_matrix2)) / 2
}

3 SNP Matching and Data Alignment

3.1 Match SNPs Between Summary Statistics and LD Matrices

NB: Since the LD matrices are different (thought not hugely) for FIN and EUR populations, I selected only the snps that were common to both and our merged summary stats.

# Find intersecting SNPs
snps_trait1 <- intersect(summary_stats$SNP, snp_list1)
snps_trait2 <- intersect(summary_stats$SNP, snp_list2)

# Number of SNPs after matching
cat("Number of SNPs after matching with LD matrices:\n")
## Number of SNPs after matching with LD matrices:
cat("Trait 1:", length(snps_trait1), "\n")
## Trait 1: 2055
cat("Trait 2:", length(snps_trait2), "\n")
## Trait 2: 1967

3.2 Identify Common SNPs

# Find common SNPs between both traits
common_snps <- intersect(snps_trait1, snps_trait2)
cat("Number of common SNPs between both traits:", length(common_snps), "\n")
## Number of common SNPs between both traits: 1910

3.3 Subset Data to Common SNPs

# Subset summary statistics
summary_stats_filtered <- summary_stats %>%
  filter(SNP %in% common_snps) %>%
  arrange(match(SNP, common_snps))

# Subset LD matrices
ld_matrix1_filtered <- ld_matrix1[common_snps, common_snps]
ld_matrix2_filtered <- ld_matrix2[common_snps, common_snps]

# Verify dimensions
cat("After subsetting to common SNPs, dimensions:\n")
## After subsetting to common SNPs, dimensions:
cat("Summary Statistics:", dim(summary_stats_filtered), "\n")
## Summary Statistics: 1910 32
cat("LD Matrix Trait 1:", dim(ld_matrix1_filtered), "\n")
## LD Matrix Trait 1: 1910 1910
cat("LD Matrix Trait 2:", dim(ld_matrix2_filtered), "\n")
## LD Matrix Trait 2: 1910 1910
# Extract SNP order from the summary statistics
summary_snps <- summary_stats_filtered$SNP

# Extract SNP order from the LD matrices
ld_snps1 <- rownames(ld_matrix1_filtered)
ld_snps2 <- rownames(ld_matrix2_filtered)

# Check if the orders match
order_match_trait1 <- identical(summary_snps, ld_snps1)
order_match_trait2 <- identical(summary_snps, ld_snps2)

# Output the order match results
cat("SNP order matches between summary stats and LD matrix 1:", order_match_trait1, "\n")
## SNP order matches between summary stats and LD matrix 1: TRUE
cat("SNP order matches between summary stats and LD matrix 2:", order_match_trait2, "\n")
## SNP order matches between summary stats and LD matrix 2: TRUE
# If orders don't match, display mismatched SNPs
if (!order_match_trait1) {
  cat("SNPs in summary_stats_filtered but not in ld_matrix1_filtered:\n")
  print(setdiff(summary_snps, ld_snps1))
  cat("SNPs in ld_matrix1_filtered but not in summary_stats_filtered:\n")
  print(setdiff(ld_snps1, summary_snps))
}

if (!order_match_trait2) {
  cat("SNPs in summary_stats_filtered but not in ld_matrix2_filtered:\n")
  print(setdiff(summary_snps, ld_snps2))
  cat("SNPs in ld_matrix2_filtered but not in summary_stats_filtered:\n")
  print(setdiff(ld_snps2, summary_snps))
}

# # Reorder LD matrices to match the order of summary_snps (if needed)
# if (!order_match_trait1) {
#   cat("Reordering LD matrix 1 to match summary statistics order...\n")
#   ld_matrix1_filtered <- ld_matrix1_filtered[summary_snps, summary_snps]
# }
# 
# if (!order_match_trait2) {
#   cat("Reordering LD matrix 2 to match summary statistics order...\n")
#   ld_matrix2_filtered <- ld_matrix2_filtered[summary_snps, summary_snps]
# }

# Final verification after reordering
cat("Final dimensions of filtered data:\n")
## Final dimensions of filtered data:
cat("Summary Statistics:", dim(summary_stats_filtered), "\n")
## Summary Statistics: 1910 32
cat("LD Matrix Trait 1:", dim(ld_matrix1_filtered), "\n")
## LD Matrix Trait 1: 1910 1910
cat("LD Matrix Trait 2:", dim(ld_matrix2_filtered), "\n")
## LD Matrix Trait 2: 1910 1910
cat("Summary statistics and LD matrices are now aligned in terms of number of snps included and their order.\n")
## Summary statistics and LD matrices are now aligned in terms of number of snps included and their order.

5 Prepare Datasets for SuSiE and Coloc

We prepare the datasets required for the SuSiE and colocalization analyses.

# Extract the SNPs from no_mismatches_no_palindromes
snps_to_include2 <- no_mismatches_no_palindromes$SNP

# Subset ld_matrix1_filtered to include only the SNPs in snps_to_include
# Also ensure the order of the rows and columns follows the order in snps_to_include
ld_matrix1_filtered2 <- ld_matrix1_filtered[snps_to_include2, snps_to_include2]

# Subset ld_matrix2_filtered to include only the SNPs in snps_to_include
# Also ensure the order of the rows and columns follows the order in snps_to_include
ld_matrix2_filtered2 <- ld_matrix2_filtered[snps_to_include2, snps_to_include2]

# Verify the dimensions of the matrices to ensure they match the number of SNPs
dim(ld_matrix1_filtered2)
## [1] 1039 1039
dim(ld_matrix2_filtered2)
## [1] 1039 1039
# Dataset for Trait 1 (Exposure)
dataset1QC <- list(
  beta = no_mismatches_no_palindromes$beta.exposure,
  varbeta = (no_mismatches_no_palindromes$se.exposure)^2,
  type = "quant",
  snp = no_mismatches_no_palindromes$SNP,
  position = no_mismatches_no_palindromes$pos.exposure,
  N = no_mismatches_no_palindromes$samplesize.exposure[1],
  MAF = no_mismatches_no_palindromes$maf.exposure,
  LD = ld_matrix1_filtered2,
  sdY = 1
)

# Dataset for Trait 2 (Outcome)
dataset2QC <- list(
  beta = no_mismatches_no_palindromes$beta.outcome,
  varbeta = (no_mismatches_no_palindromes$se.outcome)^2,
  type = "quant",
  snp = no_mismatches_no_palindromes$SNP,
  position = no_mismatches_no_palindromes$pos.outcome,
  N = no_mismatches_no_palindromes$samplesize.outcome[1],
  MAF = no_mismatches_no_palindromes$maf.outcome,
  LD = ld_matrix2_filtered2,
  sdY = 1
)

6 SuSiE Fine-Mapping

6.1 Run SuSiE for Trait 1 (Exposure)

check_alignment(dataset1QC)

## [1] 0.6468739
cat("Running SuSiE for Trait 1 (Exposure)...\n")
## Running SuSiE for Trait 1 (Exposure)...
susie_trait1QC <- susie_rss(
  z = dataset1QC$beta / sqrt(dataset1QC$varbeta),
  R = dataset1QC$LD,
  n = dataset1QC$N,
  var_y = dataset1QC$sdY^2,
  L = 10,
  verbose = FALSE
)
cat("SuSiE run for Trait 1 completed.\n")
## SuSiE run for Trait 1 completed.
summary(susie_trait1QC)
## 
## Variables in credible sets:
## 
##  variable variable_prob cs
##       434    1.00000000  3
##       100    1.00000000  6
##       264    0.99992987  4
##        33    0.99864237  5
##       415    0.97212036 10
##       289    0.59193358  7
##       882    0.45489551  9
##       901    0.25180964  9
##       864    0.23183661  9
##       495    0.22192210  7
##       704    0.21833030  8
##       705    0.21833030  8
##       754    0.21833030  8
##        23    0.21162797  2
##       317    0.16140812  7
##       924    0.16104929  8
##       402    0.14207528  1
##       459    0.14207528  1
##       464    0.14207528  1
##       615    0.14207528  1
##       617    0.14207528  1
##       633    0.14207528  1
##       671    0.14207528  1
##        25    0.13079782  2
##        26    0.13079782  2
##       270    0.13079782  2
##       271    0.13079782  2
##       398    0.13079782  2
##       621    0.13079782  2
##       706    0.11891905  8
##       896    0.06503996  8
##       793    0.06145816  9
## 
## Credible sets summary:
## 
##  cs cs_log10bf cs_avg_r2 cs_min_r2                    variable
##   1   34.44827 1.0000000 1.0000000 402,459,464,615,617,633,671
##   2   48.62429 1.0000000 1.0000000    23,25,26,270,271,398,621
##   3   52.50596 1.0000000 1.0000000                         434
##   4   16.97373 1.0000000 1.0000000                         264
##   5   43.24439 1.0000000 1.0000000                          33
##   6   20.99722 1.0000000 1.0000000                         100
##   8   16.26745 1.0000000 1.0000000     704,705,706,754,896,924
##  10    4.53025 1.0000000 1.0000000                         415
##   7   33.32946 0.9848154 0.9772667                 289,317,495
##   9   10.99161 0.9762863 0.9528571             793,864,882,901
susie_trait1_runsusieQC <- runsusie(dataset1QC)
summary(susie_trait1_runsusieQC)
## 
## Variables in credible sets:
## 
##  variable variable_prob cs
##       434    1.00000000  3
##       100    1.00000000  6
##       264    0.99992987  4
##        33    0.99864237  5
##       415    0.97212036 10
##       289    0.59193358  7
##       882    0.45489551  9
##       901    0.25180964  9
##       864    0.23183661  9
##       495    0.22192210  7
##       704    0.21833030  8
##       705    0.21833030  8
##       754    0.21833030  8
##        23    0.21162797  2
##       317    0.16140812  7
##       924    0.16104929  8
##       402    0.14207528  1
##       459    0.14207528  1
##       464    0.14207528  1
##       615    0.14207528  1
##       617    0.14207528  1
##       633    0.14207528  1
##       671    0.14207528  1
##        25    0.13079782  2
##        26    0.13079782  2
##       270    0.13079782  2
##       271    0.13079782  2
##       398    0.13079782  2
##       621    0.13079782  2
##       706    0.11891905  8
##       896    0.06503996  8
##       793    0.06145816  9
## 
## Credible sets summary:
## 
##  cs cs_log10bf cs_avg_r2 cs_min_r2                    variable
##   1   34.44827 1.0000000 1.0000000 402,459,464,615,617,633,671
##   2   48.62429 1.0000000 1.0000000    23,25,26,270,271,398,621
##   3   52.50596 1.0000000 1.0000000                         434
##   4   16.97373 1.0000000 1.0000000                         264
##   5   43.24439 1.0000000 1.0000000                          33
##   6   20.99722 1.0000000 1.0000000                         100
##   8   16.26745 1.0000000 1.0000000     704,705,706,754,896,924
##  10    4.53025 1.0000000 1.0000000                         415
##   7   33.32946 0.9848154 0.9772667                 289,317,495
##   9   10.99161 0.9762863 0.9528571             793,864,882,901
#str(susie_trait1QC)

# Extract the variables data frame from the SuSiE object
variables_df <- susie_trait1_runsusieQC$variables

if(is.null(variables_df)) {
  # This path assumes 'variables' is within the 'sets' list, adjust if necessary
  variables_df <- data.frame(
    variable = unlist(lapply(susie_trait1_runsusieQC$sets$cs, function(x) x)),
    cs = rep(names(susie_trait1_runsusieQC$sets$cs), times = lengths(susie_trait1_runsusieQC$sets$cs)),
    variable_prob = susie_trait1_runsusieQC$pip[unlist(susie_trait1_runsusieQC$sets$cs)]
  )
}

# Sum 'variable_prob' by 'cs'
sum_by_cs <- aggregate(variable_prob ~ cs, data = variables_df, FUN = sum)

# Print or further use 'sum_by_cs'
print(sum_by_cs)
##     cs variable_prob
## 1   L1     0.9945270
## 2  L10     0.9721204
## 3   L2     0.9964149
## 4   L3     1.0000000
## 5   L4     0.9999299
## 6   L5     0.9986424
## 7   L6     1.0000000
## 8   L7     0.9752638
## 9   L8     0.9999992
## 10  L9     0.9999999
# Optionally, if you want to see which CS has the highest summed probability:
# print(sum_by_cs[which.max(sum_by_cs$variable_prob), ])

# Extract the variable strings for each credible set
susie_variable_strings_nat <- summary(susie_trait1QC)$cs$variable
susie_variable_strings_nat
##  [1] "402,459,464,615,617,633,671" "23,25,26,270,271,398,621"   
##  [3] "434"                         "264"                        
##  [5] "33"                          "100"                        
##  [7] "704,705,706,754,896,924"     "415"                        
##  [9] "289,317,495"                 "793,864,882,901"
# Split the strings by commas and convert them into a numeric vector
susie_variable_indices_nat  <- unlist(strsplit(susie_variable_strings_nat , ",")) %>% as.numeric()
susie_variable_indices_nat 
##  [1] 402 459 464 615 617 633 671  23  25  26 270 271 398 621 434 264  33 100 704
## [20] 705 706 754 896 924 415 289 317 495 793 864 882 901
# Extract the corresponding SNPs from the dataset using the indices
selected_snps_nat <- dataset1QC[["snp"]][susie_variable_indices_nat]

# Print the selected SNPs
print(selected_snps_nat)
##  [1] "rs116982412" "rs17138607"  "rs17319738"  "rs7478403"   "rs75016790" 
##  [6] "rs77547080"  "rs80107016"  "rs116849951" "rs12217763"  "rs12217793" 
## [11] "rs1052708"   "rs1052709"   "rs11599219"  "rs75591548"  "rs1334593"  
## [16] "rs1023275"   "rs45438202"  "rs4747286"   "rs10904798"  "rs10904800" 
## [21] "rs10904801"  "rs11254170"  "rs56077698"  "rs6602152"   "rs12253643" 
## [26] "rs10795404"  "rs10904760"  "rs34743968"  "rs12414052"  "rs3886147"  
## [31] "rs4748310"   "rs56366628"
see_snps_in_data_nat=no_mismatches_no_palindromes[which(no_mismatches_no_palindromes$SNP%in%selected_snps_nat),]
#head(see_snps_in_data_nat)
table(see_snps_in_data_nat$Gene)
## 
## C1QL3  CUBN  PTER  RSU1 
##     4     1    17    10
# Subset the LD matrix to include only SNPs in all credible sets
ld_matrix_selected_snps_nat <- ld_matrix1_filtered2[selected_snps_nat, selected_snps_nat]
str(ld_matrix_selected_snps_nat)
##  num [1:32, 1:32] 1 1 1 1 1 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:32] "rs116982412" "rs17138607" "rs17319738" "rs7478403" ...
##   ..$ : chr [1:32] "rs116982412" "rs17138607" "rs17319738" "rs7478403" ...
# Visualize the correlation matrix using corrplot for selected SNPs
corrplot(ld_matrix_selected_snps_nat, method = "color", tl.cex = 0.6, tl.col = "black", is.corr = TRUE)
mtext("LD between SNPs in all credible sets for PTER-NAT", side = 3, line = 5, cex = 1.2)

6.2 Run SuSiE for Trait 2 (Outcome / BMI)

check_alignment(dataset2QC)

## [1] 0.7886757
cat("Running SuSiE for Trait 2 (Exposure)...\n")
## Running SuSiE for Trait 2 (Exposure)...
susie_trait2QC <- susie_rss(
  z = dataset2QC$beta / sqrt(dataset2QC$varbeta),
  R = dataset2QC$LD,
  n = dataset2QC$N,
  var_y = dataset2QC$sdY^2,
  L = 10,
  verbose = FALSE
)
cat("SuSiE run for Trait 2 completed.\n")
## SuSiE run for Trait 2 completed.
summary(susie_trait2QC)
## 
## Variables in credible sets:
## 
##  variable variable_prob cs
##       505   0.895633002  1
##      1013   0.441353438  2
##       961   0.296927609  2
##       829   0.076369709  2
##       512   0.053976420  1
##       695   0.048590964  2
##       286   0.033794794  1
##       881   0.031710212  2
##       828   0.019662102  2
##       694   0.013135876  2
##      1024   0.011692712  2
##       836   0.010036937  2
##       885   0.009677330  2
##       744   0.008171265  2
##       967   0.007566711  2
##       939   0.007397615  2
##       974   0.006877593  2
## 
## Credible sets summary:
## 
##  cs cs_log10bf cs_avg_r2 cs_min_r2
##   1   2.931517 0.8552187 0.7919149
##   2   1.945980 0.6929520 0.5315649
##                                                   variable
##                                                286,505,512
##  694,695,744,828,829,836,881,885,939,961,967,974,1013,1024
susie_trait2_runsusieQC <- runsusie(dataset2QC)
summary(susie_trait2_runsusieQC)
## 
## Variables in credible sets:
## 
##  variable variable_prob cs
##       505   0.895633002  1
##      1013   0.441353438  2
##       961   0.296927609  2
##       829   0.076369709  2
##       512   0.053976420  1
##       695   0.048590964  2
##       286   0.033794794  1
##       881   0.031710212  2
##       828   0.019662102  2
##       694   0.013135876  2
##      1024   0.011692712  2
##       836   0.010036937  2
##       885   0.009677330  2
##       744   0.008171265  2
##       967   0.007566711  2
##       939   0.007397615  2
##       974   0.006877593  2
## 
## Credible sets summary:
## 
##  cs cs_log10bf cs_avg_r2 cs_min_r2
##   1   2.931517 0.8552187 0.7919149
##   2   1.945980 0.6929520 0.5315649
##                                                   variable
##                                                286,505,512
##  694,695,744,828,829,836,881,885,939,961,967,974,1013,1024
#str(susie_trait2QC)

# Extract the variables data frame from the SuSiE object
variables_df_bmi <- susie_trait2_runsusieQC$variables

if(is.null(variables_df_bmi)) {
  # This path assumes 'variables' is within the 'sets' list, adjust if necessary
  variables_df <- data.frame(
    variable = unlist(lapply(susie_trait2_runsusieQC$sets$cs, function(x) x)),
    cs = rep(names(susie_trait2_runsusieQC$sets$cs), times = lengths(susie_trait2_runsusieQC$sets$cs)),
    variable_prob = susie_trait2_runsusieQC$pip[unlist(susie_trait2_runsusieQC$sets$cs)]
  )
}

# Sum 'variable_prob' by 'cs'
sum_by_cs_bmi <- aggregate(variable_prob ~ cs, data = variables_df, FUN = sum)

# Print or further use 'sum_by_cs'
print(sum_by_cs_bmi)
##   cs variable_prob
## 1 L1     0.9834042
## 2 L2     0.9891701
# Extract the variable strings for each credible set
susie_variable_strings_bmi <- summary(susie_trait2QC)$cs$variable
susie_variable_strings_bmi
## [1] "286,505,512"                                              
## [2] "694,695,744,828,829,836,881,885,939,961,967,974,1013,1024"
# Split the strings by commas and convert them into a numeric vector
susie_variable_indices_bmi  <- unlist(strsplit(susie_variable_strings_bmi , ",")) %>% as.numeric()
susie_variable_indices_bmi 
##  [1]  286  505  512  694  695  744  828  829  836  881  885  939  961  967  974
## [16] 1013 1024
# Extract the corresponding SNPs from the dataset using the indices
selected_snps_bmi <- dataset2QC[["snp"]][susie_variable_indices_bmi]

# Print the selected SNPs
print(selected_snps_bmi)
##  [1] "rs10795400" "rs3898395"  "rs45578234" "rs10795417" "rs10795424"
##  [6] "rs11254144" "rs2253538"  "rs2356378"  "rs2883918"  "rs4747283" 
## [11] "rs4748314"  "rs7069573"  "rs7087428"  "rs7089544"  "rs7092558" 
## [16] "rs7909091"  "rs7916182"
see_snps_in_data_bmi=no_mismatches_no_palindromes[which(no_mismatches_no_palindromes$SNP%in%selected_snps_bmi),]
#head(see_snps_in_data_bmi)
table(see_snps_in_data_bmi$Gene)
## 
## PTER RSU1 
##    3   14
# Subset the LD matrix to include only SNPs in all credible sets
ld_matrix_selected_snps_bmi <- ld_matrix2_filtered2[selected_snps_bmi, selected_snps_bmi]
str(ld_matrix_selected_snps_bmi)
##  num [1:17, 1:17] 1 0.9736 0.8899 0.0774 0.0364 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:17] "rs10795400" "rs3898395" "rs45578234" "rs10795417" ...
##   ..$ : chr [1:17] "rs10795400" "rs3898395" "rs45578234" "rs10795417" ...
# Visualize the correlation matrix using corrplot for selected SNPs
corrplot(ld_matrix_selected_snps_bmi, method = "color", tl.cex = 0.6, tl.col = "black", is.corr = TRUE)
mtext("LD between SNPs in all credible sets for BMI", side = 3, line = 5, cex = 1.2)

7 Colocalization Analysis with coloc.susie

cat("Running coloc.susie analysis...\n")
## Running coloc.susie analysis...
coloc_result_nat_bmi <- coloc.susie(susie_trait1QC, susie_trait2QC)
cat("coloc.susie analysis completed.\n")
## coloc.susie analysis completed.
coloc_result_nat_bmi$summary
coloc_summary <- coloc_result_nat_bmi$summary
print(coloc_summary)
##     nsnps        hit1      hit2    PP.H0.abf  PP.H1.abf    PP.H2.abf PP.H3.abf
##     <int>      <char>    <char>        <num>      <num>        <num>     <num>
##  1:  1039 rs116982412 rs3898395 3.790197e-36 0.01105482 3.363525e-34 0.9810192
##  2:  1039 rs116849951 rs3898395 2.546441e-50 0.01113887 2.259782e-48 0.9884935
##  3:  1039   rs1334593 rs3898395 3.327858e-54 0.01108509 2.953232e-52 0.9837105
##  4:  1039   rs1023275 rs3898395 1.139228e-18 0.01114174 1.010982e-16 0.9887480
##  5:  1039  rs45438202 rs3898395 6.108686e-45 0.01114184 5.421016e-43 0.9887576
##  6:  1039   rs4747286 rs3898395 1.078828e-22 0.01113754 9.573817e-21 0.9883746
##  7:  1039  rs10904798 rs3898395 5.792828e-18 0.01114176 5.140714e-16 0.9887501
##  8:  1039  rs12253643 rs3898395 3.161618e-06 0.01113716 2.805706e-04 0.9883420
##  9:  1039  rs10795404 rs3898395 5.021064e-35 0.01113959 4.455830e-33 0.9885568
## 10:  1039   rs4748310 rs3898395 1.093278e-12 0.01114175 9.702053e-11 0.9887495
## 11:  1039 rs116982412 rs7909091 3.365728e-35 0.09816777 3.087981e-34 0.9006651
## 12:  1039 rs116849951 rs7909091 2.239399e-49 0.09795782 2.054599e-48 0.8987345
## 13:  1039   rs1334593 rs7909091 2.947486e-53 0.09818067 2.704252e-52 0.9007837
## 14:  1039   rs1023275 rs7909091 1.003723e-17 0.09816480 9.208931e-17 0.9006377
## 15:  1039  rs45438202 rs7909091 5.381938e-44 0.09816301 4.937808e-43 0.9006213
## 16:  1039   rs4747286 rs7909091 9.446835e-22 0.09752663 8.667261e-21 0.8947697
## 17:  1039  rs10904798 rs7909091 5.103606e-17 0.09816129 4.682444e-16 0.9006055
## 18:  1039  rs12253643 rs7909091 2.784451e-05 0.09808552 2.554672e-04 0.8999093
## 19:  1039  rs10795404 rs7909091 4.400937e-34 0.09763789 4.037762e-33 0.8957927
## 20:  1039   rs4748310 rs7909091 9.631942e-12 0.09816047 8.837092e-11 0.9005979
##     nsnps        hit1      hit2    PP.H0.abf  PP.H1.abf    PP.H2.abf PP.H3.abf
##        PP.H4.abf  idx1  idx2
##            <num> <int> <int>
##  1: 0.0079260103     1     1
##  2: 0.0003675847     2     1
##  3: 0.0052044299     3     1
##  4: 0.0001102684     4     1
##  5: 0.0001005537     5     1
##  6: 0.0004878450     6     1
##  7: 0.0001081701     8     1
##  8: 0.0002370620    10     1
##  9: 0.0003036530     7     1
## 10: 0.0001087463     9     1
## 11: 0.0011671681     1     2
## 12: 0.0033076683     2     2
## 13: 0.0010356702     3     2
## 14: 0.0011974990     4     2
## 15: 0.0012156579     5     2
## 16: 0.0077036677     6     2
## 17: 0.0012332122     8     2
## 18: 0.0017218714    10     2
## 19: 0.0065693692     7     2
## 20: 0.0012416361     9     2
##        PP.H4.abf  idx1  idx2
# Save coloc.susie results
saveRDS(coloc_result_nat_bmi, file = "fin_susie_coloc_sentinel_pter_bmi_multithreading_results/coloc_result_nat_bmi.rds")

# Extract and save PP0 to PP4
pp_values <- as.data.frame(t(coloc_result_nat_bmi$summary))
colnames(pp_values) <- c("PP0", "PP1", "PP2", "PP3", "PP4")
write.table(pp_values, file = "fin_susie_coloc_sentinel_pter_bmi_multithreading_results/coloc_pp_values_coloc_result_nat_bmi.txt",
            sep = "\t", quote = FALSE, row.names = FALSE)

# Display PP values
cat("PP0 to PP4 values:\n")
## PP0 to PP4 values:
print(pp_values)
##                    PP0          PP1          PP2          PP3          PP4
## nsnps             1039         1039         1039         1039         1039
## hit1       rs116982412  rs116849951    rs1334593    rs1023275   rs45438202
## hit2         rs3898395    rs3898395    rs3898395    rs3898395    rs3898395
## PP.H0.abf 3.790197e-36 2.546441e-50 3.327858e-54 1.139228e-18 6.108686e-45
## PP.H1.abf   0.01105482   0.01113887   0.01108509   0.01114174   0.01114184
## PP.H2.abf 3.363525e-34 2.259782e-48 2.953232e-52 1.010982e-16 5.421016e-43
## PP.H3.abf    0.9810192    0.9884935    0.9837105    0.9887480    0.9887576
## PP.H4.abf 0.0079260103 0.0003675847 0.0052044299 0.0001102684 0.0001005537
## idx1                 1            2            3            4            5
## idx2                 1            1            1            1            1
##                     NA           NA           NA           NA           NA
## nsnps             1039         1039         1039         1039         1039
## hit1         rs4747286   rs10904798   rs12253643   rs10795404    rs4748310
## hit2         rs3898395    rs3898395    rs3898395    rs3898395    rs3898395
## PP.H0.abf 1.078828e-22 5.792828e-18 3.161618e-06 5.021064e-35 1.093278e-12
## PP.H1.abf   0.01113754   0.01114176   0.01113716   0.01113959   0.01114175
## PP.H2.abf 9.573817e-21 5.140714e-16 2.805706e-04 4.455830e-33 9.702053e-11
## PP.H3.abf    0.9883746    0.9887501    0.9883420    0.9885568    0.9887495
## PP.H4.abf 0.0004878450 0.0001081701 0.0002370620 0.0003036530 0.0001087463
## idx1                 6            8           10            7            9
## idx2                 1            1            1            1            1
##                     NA           NA           NA           NA           NA
## nsnps             1039         1039         1039         1039         1039
## hit1       rs116982412  rs116849951    rs1334593    rs1023275   rs45438202
## hit2         rs7909091    rs7909091    rs7909091    rs7909091    rs7909091
## PP.H0.abf 3.365728e-35 2.239399e-49 2.947486e-53 1.003723e-17 5.381938e-44
## PP.H1.abf   0.09816777   0.09795782   0.09818067   0.09816480   0.09816301
## PP.H2.abf 3.087981e-34 2.054599e-48 2.704252e-52 9.208931e-17 4.937808e-43
## PP.H3.abf    0.9006651    0.8987345    0.9007837    0.9006377    0.9006213
## PP.H4.abf 0.0011671681 0.0033076683 0.0010356702 0.0011974990 0.0012156579
## idx1                 1            2            3            4            5
## idx2                 2            2            2            2            2
##                     NA           NA           NA           NA           NA
## nsnps             1039         1039         1039         1039         1039
## hit1         rs4747286   rs10904798   rs12253643   rs10795404    rs4748310
## hit2         rs7909091    rs7909091    rs7909091    rs7909091    rs7909091
## PP.H0.abf 9.446835e-22 5.103606e-17 2.784451e-05 4.400937e-34 9.631942e-12
## PP.H1.abf   0.09752663   0.09816129   0.09808552   0.09763789   0.09816047
## PP.H2.abf 8.667261e-21 4.682444e-16 2.554672e-04 4.037762e-33 8.837092e-11
## PP.H3.abf    0.8947697    0.9006055    0.8999093    0.8957927    0.9005979
## PP.H4.abf 0.0077036677 0.0012332122 0.0017218714 0.0065693692 0.0012416361
## idx1                 6            8           10            7            9
## idx2                 2            2            2            2            2

8 Visualization

8.1 Prepare Data for Manhattan Plot for NAT

# used "summary_stats_filtered" originally 

# Define genome-wide significance threshold
genomewide_sig_threshold <- -log10(5e-8)

# Prepare data for Manhattan plot
no_mismatches_no_palindromes$log_pval <- -log10(no_mismatches_no_palindromes$pval.exposure)

# Convert to data frame if necessary
if (is.data.table(no_mismatches_no_palindromes)) {
  no_mismatches_no_palindromes_df <- as.data.frame(no_mismatches_no_palindromes)
} else {
  no_mismatches_no_palindromes_df <- no_mismatches_no_palindromes
}


# Prepare data for labeling
snps_to_label <- unique(c(selected_snps_nat)) #, selected_snps_bmi
snps_label_data <- no_mismatches_no_palindromes_df %>%
  dplyr::filter(SNP %in% snps_to_label) %>%
  dplyr::select(SNP, pos.exposure, log_pval, Gene)

8.2 Create Manhattan Plot with Labeled SNPs for NAT

# Create Manhattan Plot
manhattan_plot <- ggplot(no_mismatches_no_palindromes_df, aes(x = pos.exposure, y = log_pval)) +
  geom_point(aes(color = as.factor(chr.exposure)), alpha = 0.75, size = 1.3) +
  scale_color_manual(values = rep(c("blue", "orange"), length(unique(no_mismatches_no_palindromes_df$chr.exposure)))) +
  labs(
    x = "Genomic Position (Chromosome 10)",
    y = expression(-log[10](italic(p)-value)),
    title = "Manhattan Plot with SuSiE Credible Set SNPs Labeled"
  ) +
  geom_hline(
    yintercept = genomewide_sig_threshold,
    linetype = "dotted",
    color = "red",
    linewidth = 1
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8)
  ) +
  geom_text_repel(
    data = snps_label_data,
    aes(label = paste(SNP, Gene, sep = " - ")),
    nudge_y = 1,
    size = 3,
    color = "black",
    max.overlaps = Inf,
    box.padding = 0.5,
    point.padding = 0.5,
    segment.color = "grey50"
  )

# Display the plot
print(manhattan_plot)

# Save the plot
output_dir <- "fin_susie_coloc_sentinel_pter_bmi_multithreading_results/figures"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}
ggsave(
  filename = file.path(output_dir, "manhattan_plot_susie_snps_no_mismatches_no_palindromes_df.pdf"),
  plot = manhattan_plot,
  width = 10,
  height = 6,
  dpi = 300
)

9 Preliminary Conclusion

In this pleliminary analysis, we performed fine-mapping using SuSiE for both NAT and BMI traits, identified credible sets, and conducted colocalization analysis using coloc.susie. The Manhattan plot visualizes SNPs within the region of interest and highlights those within the credible sets for NAT. The NAT data contained four credible sets containing 32 SNPs from four nearest genes: C1QL3, CUBN, PTER, and RSU1. The BMI data contained two credible sets with SNPs with PTER and RSU1 as the nearest genes. There wasn’t a highly significant hit for BMI in the studied region, though coloc suggested evidence of separate causal SNPs for NAT and BMI, supposing non-genome-wide hits for BMI are causal.

Notably, this analysis improves upon its legacy version by identifying and removing palindromic SNPs and ensuring that the directionality of all included SNPs are aligned between the four data sets: NAT, BMI, and their respective reference-panel LD matrices. The next time we do this, maybe we should harmonize the NAT and BMI data without the harmonize_data() package or consider including a way to retain multi-allelic SNPs (i.e., those that have multiple minor alleles circulating at a reasonable frequency in the population).

10 Appendix

11 Save the R Environment

# Save the entire R environment
save.image(file = "fin_susie_coloc_sentinel_pter_bmi_multithreading_results/full_environment.RData")
cat("Entire R environment saved.\n")

12 Session Information

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.0.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] biomaRt_2.61.3      readxl_1.4.3        TwoSampleMR_0.6.8  
##  [4] corrplot_0.94       tidyr_1.3.1         ggrepel_0.9.6      
##  [7] data.table_1.16.2   dplyr_1.1.4         openxlsx_4.2.7.1   
## [10] RhpcBLASctl_0.23-42 ggplot2_3.5.1       susieR_0.12.35     
## [13] coloc_5.2.3        
## 
## loaded via a namespace (and not attached):
##  [1] DBI_1.2.3               gridExtra_2.3           httr2_1.0.5            
##  [4] rlang_1.1.4             magrittr_2.0.3          matrixStats_1.4.1      
##  [7] compiler_4.4.1          RSQLite_2.3.7           systemfonts_1.1.0      
## [10] png_0.1-8               vctrs_0.6.5             RcppZiggurat_0.1.6     
## [13] stringr_1.5.1           pkgconfig_2.0.3         crayon_1.5.3           
## [16] fastmap_1.2.0           dbplyr_2.5.0            XVector_0.45.0         
## [19] labeling_0.4.3          utf8_1.2.4              rmarkdown_2.28         
## [22] UCSC.utils_1.1.0        ragg_1.3.3              purrr_1.0.2            
## [25] bit_4.5.0               xfun_0.47               Rfast_2.1.0            
## [28] zlibbioc_1.51.1         cachem_1.1.0            GenomeInfoDb_1.41.1    
## [31] jsonlite_1.8.9          progress_1.2.3          blob_1.2.4             
## [34] highr_0.11              reshape_0.8.9           irlba_2.3.5.1          
## [37] parallel_4.4.1          prettyunits_1.2.0       R6_2.5.1               
## [40] bslib_0.8.0             stringi_1.8.4           jquerylib_0.1.4        
## [43] cellranger_1.1.0        Rcpp_1.0.13             knitr_1.48             
## [46] IRanges_2.39.2          Matrix_1.7-0            tidyselect_1.2.1       
## [49] rstudioapi_0.16.0       yaml_2.3.10             viridis_0.6.5          
## [52] curl_5.2.3              lattice_0.22-6          tibble_3.2.1           
## [55] plyr_1.8.9              Biobase_2.65.1          withr_3.0.1            
## [58] KEGGREST_1.45.1         evaluate_1.0.0          RcppParallel_5.1.9     
## [61] BiocFileCache_2.13.0    zip_2.3.1               xml2_1.3.6             
## [64] Biostrings_2.73.2       pillar_1.9.0            filelock_1.0.3         
## [67] stats4_4.4.1            generics_0.1.3          S4Vectors_0.43.2       
## [70] hms_1.1.3               munsell_0.5.1           scales_1.3.0           
## [73] glue_1.8.0              tools_4.4.1             grid_4.4.1             
## [76] AnnotationDbi_1.67.0    colorspace_2.1-1        GenomeInfoDbData_1.2.12
## [79] cli_3.6.3               rappdirs_0.3.3          textshaping_0.4.0      
## [82] fansi_1.0.6             mixsqp_0.3-54           viridisLite_0.4.2      
## [85] gtable_0.3.5            sass_0.4.9              digest_0.6.37          
## [88] BiocGenerics_0.51.1     farver_2.1.2            memoise_2.0.1          
## [91] htmltools_0.5.8.1       lifecycle_1.0.4         httr_1.4.7             
## [94] bit64_4.5.2