This analysis investigates the potential genetic overlap between N-acetyltaurine (NAT) and Body Mass Index (BMI) using summary statistics, SuSiE fine-mapping, and colocalization analysis. We focus on a 500KB region upstream and downstream of the sentinel SNP rs147852470 and include only variants with a MAF >0.05. (We will create a separate document to examine MAF >0.01.)
Initially, I obtained the summary statistics for the METSIM NAT data
and Jurgen’s (UKB) BMI data and noted they were on different builds.
After using LiftOver to get the BMI positions for GRCh38, I loaded the
data into RStudio to look at it and harmonized it within TwoSampleMR via
harmonize_data(). That code is below.
NB: I subsequently learned that
harmonize_data() worked as expected except for
multi-allelic variants, which I dealt with later by excluding them and
palindromes prior to running SuSiE.
# ---------------------------------------------
# Load Necessary Packages
# ---------------------------------------------
library(coloc) # For coloc.abf function
library(data.table) # For fread function
library(dplyr) # For data manipulation functions like mutate, filter
library(ggplot2) # For plotting
library(readr) # For read_table2 function
library(TwoSampleMR) # For harmonise_data function
library(readxl) # For reading Excel files
library(biomaRt) # For merging in TSS info using Ensembl
# ---------------------------------------------
# Load and Format NAT Data
# ---------------------------------------------
# NB: The BP_GRCh37 is actually BP_GRCh38. Testing liftOver confirmed the data is already on GRCh38.
# The sample size is 6099 for NAT from the PheWeb data.
nat_mr <- fread("/Users/charleenadams/temp_BI/phenocode-C100005466_hg38_merged_nat_twosample.tsv")
nat_mr <- as.data.frame(nat_mr)
# Filter out rows with missing or invalid RSIDs
filtered_nat_df <- nat_mr %>%
filter(!is.na(rsids) & rsids != "" & grepl("^rs", rsids)) %>%
arrange(rsids)
# Format harmonized data for exposure
exp_dat <- filtered_nat_df %>%
mutate(chr.exposure = chrom,
pos.exposure = BP_GRCh37, # This is actually BP_GRCh38
beta.exposure = beta,
se.exposure = se,
exposure = "N-acetyltaurine",
id.exposure = "N-acetyltaurine",
pval.exposure = pval,
SNP.exposure = SNP,
SNP = SNP,
effect_allele.exposure = effect_allele,
other_allele.exposure = other_allele,
eaf.exposure = eaf,
samplesize.exposure = 6099,
id_col = nearest_genes)
# ---------------------------------------------
# Load and Format BMI Data
# ---------------------------------------------
# NB: The sample size is 460000 from the Jurgens 2022 README
# Filter out rows with missing or invalid SNPs
filtered_bmi_df <- bmi_mr %>%
filter(!is.na(SNP) & SNP != "" & grepl("^rs", SNP)) %>%
arrange(SNP)
# Format harmonized data for outcome
out_dat <- filtered_bmi_df %>%
mutate(SNP = SNP,
SNP.outcome = SNP,
chr.outcome = CHR_x,
pos.outcome = BP_GRCh38,
beta.outcome = beta,
se.outcome = se,
pval.outcome = pval,
effect_allele.outcome = effect_allele,
other_allele.outcome = other_allele,
eaf.outcome = eaf,
samplesize.outcome = 460000,
id.outcome = "BMI",
outcome = "BMI")
# ---------------------------------------------
# Harmonize Data (Exposure and Outcome)
# ---------------------------------------------
dat <- harmonise_data(
exposure_dat = exp_dat,
outcome_dat = out_dat
)
# Save harmonized data
write.csv(dat, file = "/Users/charleenadams/temp_BI/harmonized_nat_BMI_dat.csv", row.names = FALSE)
# Reload harmonized data to ensure consistency
dat <- fread("/Users/charleenadams/temp_BI/harmonized_nat_BMI_dat.csv")
# ---------------------------------------------
# Remove Duplicate SNP Entries: We may LEGACY this but keeping for now
# ---------------------------------------------
# NB: I had initially kept only the first entry where an rsid was duplicated, which
# did not address what happens within the harmonize_data() process for multi-allelic variants....
# Remove duplicates based on SNP
# dups <- dat[duplicated(dat$SNP),]
# dups <- dups[order(dups$SNP),]
# dat_unique <- dat[!duplicated(dat$SNP), ]
# dim(dat_unique)
# ---------------------------------------------
# ---------------------------------------------
# Look at multi-allelic variants and harmonize
# ---------------------------------------------
harmonized_data <- dat %>%
filter(
effect_allele.exposure == effect_allele.outcome &
other_allele.exposure == other_allele.outcome
)
# Check the first few rows of the filtered data to ensure it looks correct
dim(harmonized_data)
## [1] 11588423 57
harmonized_data <- harmonized_data[order(harmonized_data$SNP),]
dup_snps <- harmonized_data %>%
group_by(SNP) %>%
filter(n() > 1)
# View the duplicates if any are present
# head(dup_snps)
# Get a count of how many SNPs have duplicates
dup_count <- n_distinct(dup_snps$SNP)
print(paste("Number of SNPs with duplicates:", dup_count))
## [1] "Number of SNPs with duplicates: 23568"
# Define a function to check if a SNP is palindromic
is_palindromic <- function(effect_allele, other_allele) {
# Define palindromic pairs
palindromic_pairs <- list(c("A", "T"), c("T", "A"), c("C", "G"), c("G", "C"))
# Check if the provided alleles form a palindromic pair
return(any(mapply(function(x, y) all(c(effect_allele, other_allele) %in% c(x, y)),
sapply(palindromic_pairs, `[[`, 1), sapply(palindromic_pairs, `[[`, 2))))
}
# Apply the function to check for palindromic SNPs in your data
dup_snps <- dup_snps %>%
mutate(is_palindromic = mapply(is_palindromic, effect_allele.exposure, other_allele.exposure))
# Filter and view only palindromic SNPs
palindromic_snps <- dup_snps %>% filter(is_palindromic)
# View the results
# print(palindromic_snps)
dim(palindromic_snps)
## [1] 12557 58
dim(dup_snps)
## [1] 53414 58
# Identify SNPs with multiple combinations of alleles
multi_allelic_snps <- dup_snps %>%
group_by(SNP) %>%
summarize(
unique_allele_combinations = n_distinct(paste(effect_allele.exposure, other_allele.exposure)),
.groups = 'drop'
) %>%
filter(unique_allele_combinations > 1)
# View the results
# print(multi_allelic_snps)
# Count the number of SNPs that are multi-allelic
multi_allelic_count <- nrow(multi_allelic_snps)
print(paste("Number of multi-allelic SNPs:", multi_allelic_count))
## [1] "Number of multi-allelic SNPs: 1008"
multi_allelic_df=dup_snps[which(dup_snps$SNP%in%multi_allelic_snps$SNP),]
# head(multi_allelic_df)
multi_allelic_df <- as.data.frame(multi_allelic_df)
# head(multi_allelic_df$pos.exposure)
dim(multi_allelic_df)
## [1] 8113 58
dup_snps=as.data.frame(dup_snps)
# head(dup_snps)
dup_snps$SNP_index=NULL
# Identify rows that are exact duplicates based on all columns except 'SNP'
exact_duplicates <- dup_snps %>%
group_by(SNP) %>%
filter(duplicated(across(everything())) | duplicated(across(everything()), fromLast = TRUE))
# Count the number of exact duplicates
exact_dup_count <- nrow(exact_duplicates)
print(paste("Number of exact duplicate rows:", exact_dup_count))
## [1] "Number of exact duplicate rows: 53408"
# View a few exact duplicates
# print(head(exact_duplicates))
# ---------------------------------------------
# Return to the dat file as processed originally without doing anything yet
# about multi-allelic variants or palindromes
# ---------------------------------------------
# Check effect allele frequencies (EAF) differences
dat$eaf_diff <- abs(dat$eaf.exposure - dat$eaf.outcome)
summary(dat$eaf_diff)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.004719 0.016603 0.077164 0.050143 0.998914
# ---------------------------------------------
# Prepare Data for Coloc Analysis
# For the rest of this, stick with the original removal of dups,
# keeping the first entry. We address this later.
# ---------------------------------------------
# Select relevant columns
columns_to_keep <- c(
"SNP", "chr.exposure", "chr.outcome", "pos.exposure", "pos.outcome", "effect_allele.exposure",
"other_allele.exposure", "effect_allele.outcome", "other_allele.outcome", "beta.exposure",
"beta.outcome", "eaf.exposure", "eaf.outcome", "se.exposure", "se.outcome", "exposure",
"id.exposure", "outcome", "id.outcome", "pval.exposure", "pval.outcome", "samplesize.exposure",
"samplesize.outcome", "id_col", "mr_keep"
)
# Create the coloc-ready dataset
dat_unique <- dat[!duplicated(dat$SNP), ]
dat_unique <- setDT(dat_unique)
str(dat_unique)
## Classes 'data.table' and 'data.frame': 11571724 obs. of 58 variables:
## $ SNP : chr "rs10" "rs1000000" "rs10000000" "rs10000003" ...
## $ effect_allele.exposure: chr "C" "A" "T" "G" ...
## $ other_allele.exposure : chr "A" "G" "A" "A" ...
## $ effect_allele.outcome : chr "C" "A" "T" "G" ...
## $ other_allele.outcome : chr "A" "G" "A" "A" ...
## $ beta.exposure : num -0.035 -0.025 -0.018 -0.026 -0.0042 0.061 -0.03 -0.018 0.017 -0.032 ...
## $ beta.outcome : num -0.000166 0.000529 -0.009911 0.001551 -0.002036 ...
## $ eaf.exposure : num 0.075 0.23 0.073 0.33 0.49 0.0048 0.0093 0.011 0.43 0.049 ...
## $ eaf.outcome : num 0.9436 0.2237 0.0601 0.7057 0.5422 ...
## $ remove : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ palindromic : logi FALSE FALSE TRUE FALSE FALSE FALSE ...
## $ ambiguous : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ id.outcome : chr "BMI" "BMI" "BMI" "BMI" ...
## $ CHR_x : int 7 12 4 4 4 4 4 4 4 4 ...
## $ BP : int 92383888 126890980 40088896 57561647 85161558 108826383 114553253 172776204 21618674 138223055 ...
## $ effect_allele.x : chr "A" "G" "A" "A" ...
## $ other_allele.x : chr "C" "A" "T" "G" ...
## $ eaf.x : num 0.0564 0.7763 0.9399 0.2943 0.4578 ...
## $ INFO : num 0.939 0.999 0.982 0.998 0.994 ...
## $ pval.x : num 0.9685 0.8147 0.0144 0.4536 0.284 ...
## $ beta.x : num 0.000166 -0.000529 0.009911 -0.001551 0.002036 ...
## $ se.x : num 0.00421 0.00226 0.00405 0.00207 0.0019 ...
## $ CHR_y : chr "chr7" "chr12" "chr4" "chr4" ...
## $ BP_GRCh38.x : num 9.28e+07 1.26e+08 4.01e+07 5.67e+07 8.42e+07 ...
## $ SNP.outcome : chr "rs10" "rs1000000" "rs10000000" "rs10000003" ...
## $ chr.outcome : int 7 12 4 4 4 4 4 4 4 4 ...
## $ pos.outcome : num 9.28e+07 1.26e+08 4.01e+07 5.67e+07 8.42e+07 ...
## $ se.outcome : num 0.00421 0.00226 0.00405 0.00207 0.0019 ...
## $ pval.outcome : num 0.9685 0.8147 0.0144 0.4536 0.284 ...
## $ samplesize.outcome : int 460000 460000 460000 460000 460000 460000 460000 460000 460000 460000 ...
## $ outcome : chr "BMI" "BMI" "BMI" "BMI" ...
## $ chrom : int 7 12 4 4 4 4 4 4 4 4 ...
## $ BP_GRCh37 : int 92754574 126406434 40087276 56695481 84240405 107905227 113632097 171855053 21617051 137301901 ...
## $ other_allele.y : chr "A" "G" "A" "A" ...
## $ effect_allele.y : chr "C" "A" "T" "G" ...
## $ rsids : chr "rs10" "rs1000000" "rs10000000" "rs10000003" ...
## $ nearest_genes : chr "CDK6" "TMEM132B" "N4BP2" "HOPX" ...
## $ pval.y : num 0.31 0.25 0.62 0.18 0.82 0.65 0.76 0.84 0.36 0.45 ...
## $ beta.y : num -0.035 -0.025 -0.018 -0.026 -0.0042 0.061 -0.03 -0.018 0.017 -0.032 ...
## $ se.y : num 0.035 0.022 0.035 0.019 0.018 0.13 0.095 0.087 0.018 0.042 ...
## $ eaf.y : num 0.075 0.23 0.073 0.33 0.49 0.0048 0.0093 0.011 0.43 0.049 ...
## $ ac : num 11282 2811 896 8214 5917 ...
## $ r2 : num 1.7e-04 2.2e-04 4.1e-05 3.0e-04 8.9e-06 3.4e-05 1.6e-05 6.6e-06 1.4e-04 9.5e-05 ...
## $ CHR : chr "chr7" "chr12" "chr4" "chr4" ...
## $ BP_GRCh38.y : num 9.31e+07 1.26e+08 4.01e+07 5.58e+07 8.33e+07 ...
## $ chr.exposure : int 7 12 4 4 4 4 4 4 4 4 ...
## $ pos.exposure : int 92754574 126406434 40087276 56695481 84240405 107905227 113632097 171855053 21617051 137301901 ...
## $ se.exposure : num 0.035 0.022 0.035 0.019 0.018 0.13 0.095 0.087 0.018 0.042 ...
## $ exposure : chr "N-acetyltaurine" "N-acetyltaurine" "N-acetyltaurine" "N-acetyltaurine" ...
## $ id.exposure : chr "N-acetyltaurine" "N-acetyltaurine" "N-acetyltaurine" "N-acetyltaurine" ...
## $ pval.exposure : num 0.31 0.25 0.62 0.18 0.82 0.65 0.76 0.84 0.36 0.45 ...
## $ SNP.exposure : chr "rs10" "rs1000000" "rs10000000" "rs10000003" ...
## $ samplesize.exposure : int 6099 6099 6099 6099 6099 6099 6099 6099 6099 6099 ...
## $ id_col : chr "CDK6" "TMEM132B" "N4BP2" "HOPX" ...
## $ action : int 2 2 2 2 2 2 2 2 2 2 ...
## $ SNP_index : int 1 1 1 1 1 1 1 1 1 1 ...
## $ mr_keep : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ eaf_diff : num 0.86861 0.00631 0.01292 0.37568 0.05225 ...
## - attr(*, ".internal.selfref")=<externalptr>
dat_coloc_ready <- dat_unique[, ..columns_to_keep]
dat_coloc_ready$Gene <- dat_coloc_ready$id_col
# ---------------------------------------------
# Retrieve TSS Information Using biomaRt
# ---------------------------------------------
# Connect to Ensembl to retrieve gene information
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
print("Connected to Ensembl")
## [1] "Connected to Ensembl"
# Retrieve TSS information
gene_list <- unique(dat_coloc_ready$id_col)
tss_data <- getBM(
attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name", "transcription_start_site"),
filters = "hgnc_symbol",
values = gene_list,
mart = ensembl)
# Rename columns for better clarity
colnames(tss_data) <- c("Ensembl_ID", "Gene", "Chromosome", "TSS")
# ---------------------------------------------
# Merge TSS Data with Coloc-Ready Data
# ---------------------------------------------
# Convert to data.table for merging
dat_dt <- as.data.table(dat_coloc_ready)
tss_data_dt <- as.data.table(tss_data)
# Ensure the 'Gene' column is of the same type
dat_dt[, Gene := as.character(Gene)]
tss_data_dt[, Gene := as.character(Gene)]
# Remove duplicated rows in tss_data and keep only the first TSS for each gene
tss_data_unique <- tss_data_dt[!duplicated(tss_data_dt$Gene), ]
# Merge data
merged_data <- merge(dat_dt, tss_data_unique, by = "Gene", all.x = TRUE)
# Save merged data
# fwrite(merged_data, "/Users/charleenadams/temp_BI/final_merged_data.csv", sep = ",")
# ---------------------------------------------
# Example: Filtering Data for a Specific Gene (PTER)
# ---------------------------------------------
pter <- merged_data[which(merged_data$Gene == "PTER"), ]
#head(pter)
We begin by loading the harmonized NAT/BMI data and identifying the sentinel SNP for the PTER region.
# Uses the original final_merged_data.csv with first entry of duplicates retained.
# Doesn't address failures to harmonize multi-allelic variants.
# We deal with this later.
# Load the final merged data
final_merged <- fread("/Users/charleenadams/temp_BI/final_merged_data.csv")
final_merged <- setDT(final_merged)
We identify the sentinel SNP and define a 500KB window around it to extract relevant SNPs.
# Identify the SNP of interest
snp_of_interest <- final_merged %>% filter(SNP == "rs147852470")
# Extract position and chromosome
snp_position <- snp_of_interest$pos.exposure[1]
chromosome <- snp_of_interest$Chromosome[1]
# Define 500KB upstream and downstream boundaries
upstream_limit <- snp_position - 500000
downstream_limit <- snp_position + 500000
# Subset data within the 500KB window
subset_data <- final_merged %>%
filter(Chromosome == chromosome &
pos.exposure >= upstream_limit &
pos.exposure <= downstream_limit)
# Save the subset data
write.table(subset_data, "500kb_up_and_downstream_sentinel_NAT.txt",
row.names = FALSE, col.names = TRUE, quote = FALSE, sep = '\t')
write.csv(subset_data, "500kb_up_and_downstream_sentinel_NAT.csv")
# Check the number of SNPs per gene
table(subset_data$Gene)
##
## C1QL3 CUBN MINDY3 PTER RSU1
## 193 358 1150 1738 1277
# Load summary statistics
summary_stats_file <- "500kb_up_and_downstream_sentinel_NAT.csv"
summary_stats <- fread(summary_stats_file, header = TRUE, stringsAsFactors = FALSE)
# Display dimensions
cat("Summary statistics loaded with dimensions:", dim(summary_stats), "\n")
## Summary statistics loaded with dimensions: 4716 30
We calculate the Minor Allele Frequency (MAF) and filter out SNPs with MAF ≤ 0.05.
# Calculate MAF
summary_stats <- summary_stats %>%
mutate(
maf.exposure = pmin(eaf.exposure, 1 - eaf.exposure, na.rm = TRUE),
maf.outcome = pmin(eaf.outcome, 1 - eaf.outcome, na.rm = TRUE)
) %>%
filter(maf.exposure > 0.05)
# Display dimensions after filtering
cat("After MAF filtering, dimensions:", dim(summary_stats), "\n")
## After MAF filtering, dimensions: 2245 32
wget https://hgdownload.soe.ucsc.edu/gbdb/hg38/1000Genomes/ALL.chr1.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
#!/bin/bash
# Define the directories and files
VCF_DIR="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch"
DBSNP_FILE="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/00-All.vcf.gz"
OUTPUT_DIR="$VCF_DIR/annotated"
mkdir -p "$OUTPUT_DIR"
# Get the list of chromosome VCF files only
VCF_FILES=("$VCF_DIR"/ALL.chr*.vcf.gz)
# Function to process a single VCF file
process_vcf() {
local vcf_file="$1"
echo "Processing ${vcf_file}..."
# Create index for the VCF file
echo "Creating index for ${vcf_file}..."
tabix -p vcf "$vcf_file"
# Annotate the VCF file using dbSNP
echo "Annotating ${vcf_file}..."
bcftools annotate -a "$DBSNP_FILE" -c ID "$vcf_file" -O z -o "$OUTPUT_DIR/$(basename "${vcf_file%.vcf.gz}_annotated.vcf.gz")"
echo "Annotated VCF saved as $OUTPUT_DIR/$(basename "${vcf_file%.vcf.gz}_annotated.vcf.gz")"
}
# Export the function and variables for parallel processing
export -f process_vcf
export DBSNP_FILE OUTPUT_DIR
# Use GNU Parallel to process VCF files
echo "Starting annotation in parallel..."
parallel process_vcf ::: "${VCF_FILES[@]}"
echo "All annotations completed."
# Run in background with:
nohup ./annotate_rsids.sh > annotate_rsids_log.txt 2>&1 &
The NAT summary statistics used Finnish samples.
I retrieved a 1000 Genomes annotation file with population and subpopulation linkers to select out populations: integrated_call_samples_v3.20130502.ALL.panel
#!/bin/bash
# Set the paths
ANNOTATED_DIR="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated"
OUTPUT_DIR="$ANNOTATED_DIR/fin"
PANEL_FILE="$ANNOTATED_DIR/integrated_call_samples_v3.20130502.ALL.panel"
# Create the output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"
# Get the list of Finnish samples
FIN_SAMPLES=$(awk '$2 == "FIN" {print $1}' "$PANEL_FILE")
# Filter VCF files in parallel
for vcf_file in "$ANNOTATED_DIR"/*.vcf.gz; do
base_name=$(basename "$vcf_file" .vcf.gz)
output_file="$OUTPUT_DIR/${base_name}_FIN.vcf.gz"
# Run the filtering command
bcftools view -S <(echo "$FIN_SAMPLES") -O z -o "$output_file" "$vcf_file" &
done
# Wait for all background processes to finish
wait
echo "Filtering completed. Finnish samples are saved in $OUTPUT_DIR."
# Run in background with:
nohup ./filter_fin_population.sh > filter_fin_population_log.txt 2>&1 &
NB: Without MAF filtering, the subsequent LD matrix will be filled with NaNs.
#!/bin/bash
# ----------------------------------------------------------------------------------
# Script: create_bfiles_from_vcf.sh
# Description: Convert a VCF.gz file to PLINK binary files with specified filtering options.
# Usage: ./create_bfiles_from_vcf.sh
# ----------------------------------------------------------------------------------
# ------------------------------
# Step 1: Define File Paths
# ------------------------------
# Input VCF.gz file path
INPUT_VCF="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/fin/ALL.chr10.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased_annotated_FIN.vcf.gz"
# Output PLINK binary files prefix
OUTPUT_PREFIX="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/fin/bfiles_filtered/ALL.chr10.maf_filtered_fin2"
# ------------------------------
# Step 2: Check for Required Tools
# ------------------------------
# Function to check if a command exists
command_exists () {
command -v "$1" >/dev/null 2>&1 ;
}
# Check for plink2
if ! command_exists plink2 ; then
echo "Error: plink2 is not installed or not in your PATH."
echo "Please install plink2 before running this script."
exit 1
fi
# Check for tabix
if ! command_exists tabix ; then
echo "Error: tabix is not installed or not in your PATH."
echo "Please install tabix before running this script."
exit 1
fi
# ------------------------------
# Step 3: Verify Input VCF File
# ------------------------------
if [ ! -f "$INPUT_VCF" ]; then
echo "Error: Input VCF file not found at $INPUT_VCF"
exit 1
fi
# Check if VCF is compressed with gzip
if [[ "$INPUT_VCF" != *.gz ]]; then
echo "Error: Input VCF file must be gzip-compressed (.vcf.gz)"
exit 1
fi
# ------------------------------
# Step 4: Index the VCF File (If Not Already Indexed)
# ------------------------------
INDEX_FILE="${INPUT_VCF}.tbi"
if [ ! -f "$INDEX_FILE" ]; then
echo "VCF index (.tbi) not found. Creating index with tabix..."
tabix -p vcf "$INPUT_VCF"
if [ $? -ne 0 ]; then
echo "Error: Failed to index VCF file."
exit 1
fi
echo "VCF file indexed successfully."
else
echo "VCF index (.tbi) already exists."
fi
# ------------------------------
# Step 5: Run plink2 Command
# ------------------------------
echo "Starting plink2 conversion from VCF to PLINK binary files..."
plink2 \
--vcf "$INPUT_VCF" \
--keep-allele-order \
--maf 0.05 \
--make-bed \
--max-alleles 2 \
--snps-only just-acgt \
--out "$OUTPUT_PREFIX" \
--threads 4 \
--memory 16000
# ------------------------------
# Step 6: Verify Output
# ------------------------------
if [ -f "${OUTPUT_PREFIX}.bed" ] && [ -f "${OUTPUT_PREFIX}.bim" ] && [ -f "${OUTPUT_PREFIX}.fam" ]; then
echo "plink2 conversion completed successfully."
echo "Output files:"
echo "${OUTPUT_PREFIX}.bed"
echo "${OUTPUT_PREFIX}.bim"
echo "${OUTPUT_PREFIX}.fam"
else
echo "Error: plink2 failed to create all output files."
echo "Please check the error messages above for more details."
exit 1
fi
echo "Done."
# Run with:
nohup ./create_bfiles_from_vcf_fin.sh > create_bfiles_from_vcf_fin_log.txt 2>&1 &
#!/bin/bash
# Set the directory containing the FIN VCF files
vcf_dir="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/fin"
output_dir="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/bfiles_filtered"
# Create output directory if it doesn't exist
mkdir -p "$output_dir"
# Loop through each VCF file in the directory
for vcf_file in "$vcf_dir"/*.vcf.gz; do
# Get the base name of the VCF file (without the extension)
base_name=$(basename "$vcf_file" .vcf.gz)
# Create bfiles using plink2 with filtering
plink2 --vcf "$vcf_file" \
--maf 0.05 \
--make-bed \
--out "$output_dir/$base_name" \
--allow-extra-chr \
--keep-allele-order \
--snps-only just-acgt \
--max-alleles 2
echo "Converted and filtered $vcf_file to bfiles."
done
# Run in background with:
nohup ./create_filtered_bfiles.sh > create_filtered_bfiles_log.txt 2>&1 &
Selected 1MB up and downstream of the PTER sentinel SNP, as we use R later to narrow to the 500KB region.
#!/bin/bash
# Define input files and parameters
PLINK_PATH="/usr/local/bin/plink2"
DATA_DIR="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/fin/bfiles_filtered"
OUTPUT_DIR="$DATA_DIR/chr10_ld_matrix"
CHR=10
START=15402141
END=17402141
# Ensure the output directory exists
mkdir -p "$OUTPUT_DIR"
# Input bfile for chromosome 10 after MAF filtering
INPUT_BFILE="$DATA_DIR/ALL.chr${CHR}.maf_filtered_fin2"
# Generate the LD matrix using PLINK 2 with the MAF-filtered data
echo "Generating LD matrix for chromosome $CHR using MAF-filtered data..."
$PLINK_PATH --bfile "$INPUT_BFILE" \
--chr "$CHR" \
--from-bp "$START" \
--to-bp "$END" \
--r-phased square \
--out "$OUTPUT_DIR/LD_matrix_chr${CHR}_${START}_${END}_maf_filtered_fin2"
# Check if the command succeeded
if [ $? -ne 0 ]; then
echo "Error during LD matrix calculation. Check the log file for details."
exit 1
fi
echo "LD matrix generation for chromosome $CHR completed successfully."
# Run in background with:
nohup ./make_fin_filtered_matrix_maf.sh > make_fin_filtered_matrix_maf_log.txt 2>&1 &
# Dataframe version
nohup ./make_fin_filtered_matrix_maf_df.sh > make_fin_filtered_matrix_maf_df_log.txt 2>&1 &
The Jurgens BMI summary statistics uses UK Biobank samples, which have a different LD pattern than the Finnish population.
#!/bin/bash
# Set the paths
ANNOTATED_DIR="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated"
OUTPUT_DIR="$ANNOTATED_DIR/eur"
PANEL_FILE="$ANNOTATED_DIR/integrated_call_samples_v3.20130502.ALL.panel"
# Create the output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"
# Get the list of European samples and write to a file
EUR_SAMPLE_FILE="$OUTPUT_DIR/eur_samples.txt"
awk '$3 == "EUR" {print $1}' "$PANEL_FILE" > "$EUR_SAMPLE_FILE"
# Filter VCF files in parallel
for vcf_file in "$ANNOTATED_DIR"/*.vcf.gz; do
base_name=$(basename "$vcf_file" .vcf.gz)
output_file="$OUTPUT_DIR/${base_name}_EUR.vcf.gz"
# Run the filtering command
bcftools view -S "$EUR_SAMPLE_FILE" -O z -o "$output_file" "$vcf_file" &
done
# Wait for all background processes to finish
wait
echo "Filtering completed. European samples are saved in $OUTPUT_DIR."
NB: Without MAF filtering, the subsequent LD matrix will be filled with NaNs.
#!/bin/bash
# ----------------------------------------------------------------------------------
# Script: create_bfiles_from_vcf.sh
# Description: Convert a VCF.gz file to PLINK binary files with specified filtering options.
# Usage: ./create_bfiles_from_vcf.sh
# ----------------------------------------------------------------------------------
# ------------------------------
# Step 1: Define File Paths
# ------------------------------
# Input VCF.gz file path
INPUT_VCF="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/eur/ALL.chr10.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased_annotated_EUR.vcf.gz"
# Output PLINK binary files prefix
OUTPUT_PREFIX="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/eur/eur_bfiles_filtered/ALL.chr10.maf_filtered_EUR"
# ------------------------------
# Step 2: Check for Required Tools
# ------------------------------
# Function to check if a command exists
command_exists () {
command -v "$1" >/dev/null 2>&1 ;
}
# Check for plink2
if ! command_exists plink2 ; then
echo "Error: plink2 is not installed or not in your PATH."
echo "Please install plink2 before running this script."
exit 1
fi
# Check for tabix
if ! command_exists tabix ; then
echo "Error: tabix is not installed or not in your PATH."
echo "Please install tabix before running this script."
exit 1
fi
# ------------------------------
# Step 3: Verify Input VCF File
# ------------------------------
if [ ! -f "$INPUT_VCF" ]; then
echo "Error: Input VCF file not found at $INPUT_VCF"
exit 1
fi
# Check if VCF is compressed with gzip
if [[ "$INPUT_VCF" != *.gz ]]; then
echo "Error: Input VCF file must be gzip-compressed (.vcf.gz)"
exit 1
fi
# ------------------------------
# Step 4: Index the VCF File (If Not Already Indexed)
# ------------------------------
INDEX_FILE="${INPUT_VCF}.tbi"
if [ ! -f "$INDEX_FILE" ]; then
echo "VCF index (.tbi) not found. Creating index with tabix..."
tabix -p vcf "$INPUT_VCF"
if [ $? -ne 0 ]; then
echo "Error: Failed to index VCF file."
exit 1
fi
echo "VCF file indexed successfully."
else
echo "VCF index (.tbi) already exists."
fi
# ------------------------------
# Step 5: Run plink2 Command
# ------------------------------
echo "Starting plink2 conversion from VCF to PLINK binary files..."
plink2 \
--vcf "$INPUT_VCF" \
--keep-allele-order \
--maf 0.05 \
--make-bed \
--max-alleles 2 \
--snps-only just-acgt \
--out "$OUTPUT_PREFIX" \
--threads 4 \
--memory 16000
# ------------------------------
# Step 6: Verify Output
# ------------------------------
if [ -f "${OUTPUT_PREFIX}.bed" ] && [ -f "${OUTPUT_PREFIX}.bim" ] && [ -f "${OUTPUT_PREFIX}.fam" ]; then
echo "plink2 conversion completed successfully."
echo "Output files:"
echo "${OUTPUT_PREFIX}.bed"
echo "${OUTPUT_PREFIX}.bim"
echo "${OUTPUT_PREFIX}.fam"
else
echo "Error: plink2 failed to create all output files."
echo "Please check the error messages above for more details."
exit 1
fi
echo "Done."
#!/bin/bash
# Define input files and parameters
PLINK_PATH="/usr/local/bin/plink2"
DATA_DIR="/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/eur/eur_bfiles_filtered"
OUTPUT_DIR="$DATA_DIR/chr10_ld_matrix"
CHR=10
START=15402141
END=17402141
# Ensure the output directory exists
mkdir -p "$OUTPUT_DIR"
# Input bfile for chromosome 10 after MAF filtering
INPUT_BFILE="$DATA_DIR/ALL.chr${CHR}.maf_filtered_EUR"
# Generate the LD matrix using PLINK 2 with the MAF-filtered data
echo "Generating LD matrix for chromosome $CHR using MAF-filtered data..."
$PLINK_PATH --bfile "$INPUT_BFILE" \
--chr "$CHR" \
--from-bp "$START" \
--to-bp "$END" \
--r-phased square \
--out "$OUTPUT_DIR/LD_matrix_chr${CHR}_${START}_${END}_maf_filtered"
# Check if the command succeeded
if [ $? -ne 0 ]; then
echo "Error during LD matrix calculation. Check the log file for details."
exit 1
fi
echo "LD matrix generation for chromosome $CHR completed successfully."
# Run with:
nohup ./make_eur_filtered_matrix_maf.sh > make_eur_filtered_matrix_maf_log.txt 2>&1 &
# Dataframe version:
nohup ./make_eur_filtered_df_maf.sh > make_eur_filtered_df_maf_log.txt 2>&1 &
We load the LD matrices and SNP lists for both traits.
NB: Loaded below are the square LD matrices made
with the --r-phased square flag. I also produced are the
matrcies made with --r-phased flag. Those matrices contain
fewer SNPs, which Grok suggests is due to the square matrices maybe
including some “monomorphic” SNPs? I load the dataframe matrices later
in the script, as they also contain explicit info for which allele plink
used as the “major” (MAJ_A) allele.
# LD matrix and SNP list for Trait 1 (Exposure)
ld_matrix_file1 <- "LD_matrices/1000Genomes_scratch/annotated/fin/bfiles_filtered/chr10_ld_matrix/LD_matrix_chr10_15402141_17402141_maf_filtered_fin2.phased.vcor1"
snp_list_file1 <- "LD_matrices/1000Genomes_scratch/annotated/fin/bfiles_filtered/chr10_ld_matrix/LD_matrix_chr10_15402141_17402141_maf_filtered_fin2.phased.vcor1.vars"
# Load LD matrix and SNP list for Trait 1
ld_matrix1 <- as.matrix(read.table(ld_matrix_file1, header = FALSE, stringsAsFactors = FALSE))
snp_list1 <- read.table(snp_list_file1, header = FALSE, stringsAsFactors = FALSE)$V1
rownames(ld_matrix1) <- colnames(ld_matrix1) <- snp_list1
# Ensure LD matrix is symmetric
if (!isSymmetric(ld_matrix1)) {
ld_matrix1 <- (ld_matrix1 + t(ld_matrix1)) / 2
}
# LD matrix and SNP list for Trait 2 (Outcome)
ld_matrix_file2 <- "LD_matrices/1000Genomes_scratch/annotated/eur/eur_bfiles_filtered/chr10_ld_matrix/LD_matrix_chr10_15402141_17402141_maf_filtered_EUR2.phased.vcor1"
snp_list_file2 <- "LD_matrices/1000Genomes_scratch/annotated/eur/eur_bfiles_filtered/chr10_ld_matrix/LD_matrix_chr10_15402141_17402141_maf_filtered_EUR2.phased.vcor1.vars"
# Load LD matrix and SNP list for Trait 2
ld_matrix2 <- as.matrix(read.table(ld_matrix_file2, header = FALSE, stringsAsFactors = FALSE))
snp_list2 <- read.table(snp_list_file2, header = FALSE, stringsAsFactors = FALSE)$V1
rownames(ld_matrix2) <- colnames(ld_matrix2) <- snp_list2
# Ensure LD matrix is symmetric
if (!isSymmetric(ld_matrix2)) {
ld_matrix2 <- (ld_matrix2 + t(ld_matrix2)) / 2
}
NB: Since the LD matrices are different (thought not hugely) for FIN and EUR populations, I selected only the snps that were common to both and our merged summary stats.
# Find intersecting SNPs
snps_trait1 <- intersect(summary_stats$SNP, snp_list1)
snps_trait2 <- intersect(summary_stats$SNP, snp_list2)
# Number of SNPs after matching
cat("Number of SNPs after matching with LD matrices:\n")
## Number of SNPs after matching with LD matrices:
cat("Trait 1:", length(snps_trait1), "\n")
## Trait 1: 2055
cat("Trait 2:", length(snps_trait2), "\n")
## Trait 2: 1967
# Find common SNPs between both traits
common_snps <- intersect(snps_trait1, snps_trait2)
cat("Number of common SNPs between both traits:", length(common_snps), "\n")
## Number of common SNPs between both traits: 1910
# Subset summary statistics
summary_stats_filtered <- summary_stats %>%
filter(SNP %in% common_snps) %>%
arrange(match(SNP, common_snps))
# Subset LD matrices
ld_matrix1_filtered <- ld_matrix1[common_snps, common_snps]
ld_matrix2_filtered <- ld_matrix2[common_snps, common_snps]
# Verify dimensions
cat("After subsetting to common SNPs, dimensions:\n")
## After subsetting to common SNPs, dimensions:
cat("Summary Statistics:", dim(summary_stats_filtered), "\n")
## Summary Statistics: 1910 32
cat("LD Matrix Trait 1:", dim(ld_matrix1_filtered), "\n")
## LD Matrix Trait 1: 1910 1910
cat("LD Matrix Trait 2:", dim(ld_matrix2_filtered), "\n")
## LD Matrix Trait 2: 1910 1910
# Extract SNP order from the summary statistics
summary_snps <- summary_stats_filtered$SNP
# Extract SNP order from the LD matrices
ld_snps1 <- rownames(ld_matrix1_filtered)
ld_snps2 <- rownames(ld_matrix2_filtered)
# Check if the orders match
order_match_trait1 <- identical(summary_snps, ld_snps1)
order_match_trait2 <- identical(summary_snps, ld_snps2)
# Output the order match results
cat("SNP order matches between summary stats and LD matrix 1:", order_match_trait1, "\n")
## SNP order matches between summary stats and LD matrix 1: TRUE
cat("SNP order matches between summary stats and LD matrix 2:", order_match_trait2, "\n")
## SNP order matches between summary stats and LD matrix 2: TRUE
# If orders don't match, display mismatched SNPs
if (!order_match_trait1) {
cat("SNPs in summary_stats_filtered but not in ld_matrix1_filtered:\n")
print(setdiff(summary_snps, ld_snps1))
cat("SNPs in ld_matrix1_filtered but not in summary_stats_filtered:\n")
print(setdiff(ld_snps1, summary_snps))
}
if (!order_match_trait2) {
cat("SNPs in summary_stats_filtered but not in ld_matrix2_filtered:\n")
print(setdiff(summary_snps, ld_snps2))
cat("SNPs in ld_matrix2_filtered but not in summary_stats_filtered:\n")
print(setdiff(ld_snps2, summary_snps))
}
# # Reorder LD matrices to match the order of summary_snps (if needed)
# if (!order_match_trait1) {
# cat("Reordering LD matrix 1 to match summary statistics order...\n")
# ld_matrix1_filtered <- ld_matrix1_filtered[summary_snps, summary_snps]
# }
#
# if (!order_match_trait2) {
# cat("Reordering LD matrix 2 to match summary statistics order...\n")
# ld_matrix2_filtered <- ld_matrix2_filtered[summary_snps, summary_snps]
# }
# Final verification after reordering
cat("Final dimensions of filtered data:\n")
## Final dimensions of filtered data:
cat("Summary Statistics:", dim(summary_stats_filtered), "\n")
## Summary Statistics: 1910 32
cat("LD Matrix Trait 1:", dim(ld_matrix1_filtered), "\n")
## LD Matrix Trait 1: 1910 1910
cat("LD Matrix Trait 2:", dim(ld_matrix2_filtered), "\n")
## LD Matrix Trait 2: 1910 1910
cat("Summary statistics and LD matrices are now aligned in terms of number of snps included and their order.\n")
## Summary statistics and LD matrices are now aligned in terms of number of snps included and their order.
# Define file paths
fin_ld_df2 <- "/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/fin/bfiles_filtered/chr10_ld_matrix/LD_matrix_chr10_15402141_17402141_maf_filtered_fin2_df.vcor"
eur_ld_df2 <- "/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/eur/eur_bfiles_filtered/chr10_ld_matrix/LD_matrix_chr10_15402141_17402141_maf_filtered_EUR2_df.vcor"
# Read the LD data
fin_ld_df2 <- fread(fin_ld_df2)
eur_ld_df2 <- fread(eur_ld_df2)
# Ensure data.table is loaded as data.frame for compatibility with dplyr functions
setDT(summary_stats_filtered)
setDT(fin_ld_df2)
setDT(eur_ld_df2)
#head(summary_stats_filtered)
# Remove duplicates in fin_ld_df2 based on ID_A, keeping the first occurrence
fin_ld_df2_unique <- fin_ld_df2 %>%
distinct(ID_A, .keep_all = TRUE)
dim(fin_ld_df2)
## [1] 296185 9
dim(fin_ld_df2_unique)
## [1] 4932 9
#head(fin_ld_df2_unique)
# Step 1: Remove duplicates in fin_ld_df2 based on ID_A, keeping the first occurrence
eur_ld_df2_unique <- eur_ld_df2 %>%
distinct(ID_A, .keep_all = TRUE)
dim(eur_ld_df2)
## [1] 252624 9
dim(eur_ld_df2_unique)
## [1] 4658 9
#head(eur_ld_df2_unique)
# Merge with summary_stats_filtered on SNP and ID_A
plink_merged_data <- summary_stats_filtered %>%
inner_join(fin_ld_df2_unique, by = c("SNP" = "ID_A"))
dim(summary_stats_filtered)
## [1] 1910 32
dim(plink_merged_data)
## [1] 1876 40
# Merge plink_merged_data with eur_ld_df2_unique by ID_A
# Use the `suffix` parameter to differentiate columns from each dataset
final_merged_plink_data <- plink_merged_data %>%
inner_join(eur_ld_df2_unique, by = c("SNP" = "ID_A"), suffix = c("_fin", "_eur"))
# View the dimensions of the resulting merged data
dim(final_merged_plink_data)
## [1] 1859 48
# Display the first few rows to verify the merge
#head(final_merged_plink_data)
# Check if MAJ_A_eur and MAJ_A_fin are the same
identical_maj_alleles <- all(final_merged_plink_data$MAJ_A_eur == final_merged_plink_data$MAJ_A_fin)
# Print the result
if (identical_maj_alleles) {
cat("MAJ_A_eur and MAJ_A_fin are identical for all SNPs.\n")
} else {
cat("MAJ_A_eur and MAJ_A_fin differ for some SNPs.\n")
}
## MAJ_A_eur and MAJ_A_fin differ for some SNPs.
# Optionally, get the rows where they differ
mismatched_alleles <- final_merged_plink_data[final_merged_plink_data$MAJ_A_eur != final_merged_plink_data$MAJ_A_fin, ]
dim(mismatched_alleles)
## [1] 46 48
# View the first few rows of mismatched alleles if any
head(mismatched_alleles)
## V1 Gene SNP chr.exposure chr.outcome pos.exposure pos.outcome
## <int> <char> <char> <int> <int> <int> <int>
## 1: 59 C1QL3 rs12412368 10 10 16543523 16543523
## 2: 100 C1QL3 rs4748302 10 10 16513863 16513863
## 3: 129 C1QL3 rs6602130 10 10 16516291 16516291
## 4: 145 C1QL3 rs7084722 10 10 16553397 16553397
## 5: 182 C1QL3 rs7909832 10 10 16514711 16514711
## 6: 330 CUBN rs1797081 10 10 16832566 16832566
## effect_allele.exposure other_allele.exposure effect_allele.outcome
## <char> <char> <char>
## 1: C T C
## 2: A T A
## 3: T C T
## 4: G A G
## 5: C T C
## 6: C T C
## other_allele.outcome beta.exposure beta.outcome eaf.exposure eaf.outcome
## <char> <num> <num> <num> <num>
## 1: T -0.094 0.003897880 0.36 0.532614
## 2: T -0.120 -0.002748440 0.35 0.431003
## 3: C -0.120 0.002841890 0.35 0.556221
## 4: A 0.110 -0.004470120 0.37 0.442371
## 5: T -0.120 0.002827650 0.36 0.568412
## 6: T 0.045 0.000515129 0.42 0.497911
## se.exposure se.outcome exposure id.exposure outcome id.outcome
## <num> <num> <char> <char> <char> <char>
## 1: 0.019 0.00197099 N-acetyltaurine N-acetyltaurine BMI BMI
## 2: 0.019 0.00190378 N-acetyltaurine N-acetyltaurine BMI BMI
## 3: 0.019 0.00189574 N-acetyltaurine N-acetyltaurine BMI BMI
## 4: 0.019 0.00199622 N-acetyltaurine N-acetyltaurine BMI BMI
## 5: 0.019 0.00190602 N-acetyltaurine N-acetyltaurine BMI BMI
## 6: 0.018 0.00191158 N-acetyltaurine N-acetyltaurine BMI BMI
## pval.exposure pval.outcome samplesize.exposure samplesize.outcome id_col
## <num> <num> <int> <int> <char>
## 1: 7.5e-07 0.0479711 6099 460000 C1QL3
## 2: 1.1e-10 0.1488320 6099 460000 C1QL3
## 3: 8.8e-11 0.1338490 6099 460000 C1QL3
## 4: 2.9e-09 0.0251368 6099 460000 C1QL3
## 5: 9.3e-11 0.1379320 6099 460000 C1QL3
## 6: 1.4e-02 0.7875620 6099 460000 CUBN
## mr_keep Ensembl_ID Chromosome TSS maf.exposure maf.outcome
## <lgcl> <char> <int> <int> <num> <num>
## 1: TRUE ENSG00000165985 10 16521879 0.36 0.467386
## 2: FALSE ENSG00000165985 10 16521879 0.35 0.431003
## 3: TRUE ENSG00000165985 10 16521879 0.35 0.443779
## 4: TRUE ENSG00000165985 10 16521879 0.37 0.442371
## 5: TRUE ENSG00000165985 10 16521879 0.36 0.431588
## 6: TRUE ENSG00000107611 10 17129811 0.42 0.497911
## #CHROM_A_fin POS_A_fin MAJ_A_fin CHROM_B_fin POS_B_fin ID_B_fin MAJ_B_fin
## <int> <int> <char> <int> <int> <char> <char>
## 1: 10 16543523 T 10 16546145 rs8181375 G
## 2: 10 16513863 T 10 16514711 rs7909832 T
## 3: 10 16516291 C 10 16516431 rs2298125 T
## 4: 10 16553397 G 10 16555959 rs6602134 G
## 5: 10 16514711 T 10 16515561 rs7905486 G
## 6: 10 16832566 C 10 16832632 rs10904823 G
## PHASED_R_fin #CHROM_A_eur POS_A_eur MAJ_A_eur CHROM_B_eur POS_B_eur
## <num> <int> <int> <char> <int> <int>
## 1: 0.914796 10 16543523 C 10 16546145
## 2: 1.000000 10 16513863 A 10 16514711
## 3: 0.661258 10 16516291 T 10 16516431
## 4: 0.607487 10 16553397 A 10 16555959
## 5: 0.777368 10 16514711 C 10 16515561
## 6: -0.450745 10 16832566 T 10 16833163
## ID_B_eur MAJ_B_eur PHASED_R_eur
## <char> <char> <num>
## 1: rs8181375 G -0.820342
## 2: rs7909832 C 0.993975
## 3: rs2298125 T -0.523863
## 4: rs6602134 G -0.555018
## 5: rs7905486 G -0.614060
## 6: rs4088455 A 0.462001
# Define the "not in" operator
`%notin%` <- function(x, y) {
!(x %in% y)
}
final_merged_plink_data_ld_matricesQC <- final_merged_plink_data[which(final_merged_plink_data$SNP%notin%mismatched_alleles$SNP),]
dim(final_merged_plink_data_ld_matricesQC)
## [1] 1813 48
dim(final_merged_plink_data)
## [1] 1859 48
identical_maj_alleles <- all(final_merged_plink_data_ld_matricesQC$MAJ_A_eur == final_merged_plink_data_ld_matricesQC$MAJ_A_fin)
# Print the result
if (identical_maj_alleles) {
cat("MAJ_A_eur and MAJ_A_fin are identical for all SNPs.\n")
} else {
cat("MAJ_A_eur and MAJ_A_fin differ for some SNPs.\n")
}
## MAJ_A_eur and MAJ_A_fin are identical for all SNPs.
# Optionally, get the rows where they differ
mismatched_alleles_fin_MAJ_A <- final_merged_plink_data_ld_matricesQC[final_merged_plink_data_ld_matricesQC$other_allele.exposure != final_merged_plink_data_ld_matricesQC$MAJ_A_fin, ]
#head(mismatched_alleles_fin_MAJ_A)
dim(mismatched_alleles_fin_MAJ_A)
## [1] 579 48
# Create a column to indicate mismatched alleles
final_merged_plink_data_ld_matricesQC <- final_merged_plink_data_ld_matricesQC %>%
mutate(
mismatch_flag_MAJ_A_fin_other_allele.exposure = other_allele.exposure != MAJ_A_fin
)
# Identify palindromes
final_merged_plink_data_ld_matricesQC <- final_merged_plink_data_ld_matricesQC %>%
mutate(
mismatch_flag = other_allele.exposure != MAJ_A_fin,
palindromic = (effect_allele.exposure %in% c("A", "T") & other_allele.exposure %in% c("A", "T")) |
(effect_allele.exposure %in% c("C", "G") & other_allele.exposure %in% c("C", "G"))
)
# My bespoke harmonizing didn't work, which may be because the SNPs that differ are multi-allelic. I removed duplicates when harmonizing the original NAT and BMI summary statistics. So, if the 1000 Genomes data contained multiple entries for SNPs with multiple alleles, that could very likely explain why attempting to harmonize the LD data to the summary stats doesn't work...
# final_merged_plink_data_harmonized <- final_merged_plink_data %>%
# mutate(
# # Flip beta for all mismatches
# beta.exposure = ifelse(mismatch_flag, -beta.exposure, beta.exposure),
#
# # For non-palindromic SNPs, swap the alleles
# effect_allele.exposure = ifelse(mismatch_flag & !palindromic, other_allele.exposure, effect_allele.exposure),
# other_allele.exposure = ifelse(mismatch_flag & !palindromic, effect_allele.exposure, other_allele.exposure),
#
# # For palindromic SNPs, we need to decide based on frequency or strand information
# # Here, we'll just flip if the frequencies suggest it's necessary, but this might need more nuanced handling
# # If we know we should flip based on external information or frequency:
# flip_palindromic = ifelse(palindromic & mismatch_flag &
# (eaf.exposure > 0.5 & (1 - eaf.outcome) > 0.5 |
# eaf.exposure < 0.5 & eaf.outcome < 0.5),
# TRUE, FALSE),
# beta.exposure = ifelse(flip_palindromic, -beta.exposure, beta.exposure)
# ) %>%
# select(-mismatch_flag, -palindromic, -flip_palindromic) # Clean up temporary columns
#
# # Step 3: Check for remaining mismatches: all still there; didn't work
# remaining_mismatches <- final_merged_plink_data_harmonized %>%
# filter(other_allele.exposure != MAJ_A_fin)
# dim(remaining_mismatches)
# Change strategy: remove mismatches and palindromes...
no_mismatches=final_merged_plink_data_ld_matricesQC[which(final_merged_plink_data_ld_matricesQC$mismatch_flag_MAJ_A_fin_other_allele.exposure==FALSE),]
dim(no_mismatches)
## [1] 1234 51
table(no_mismatches$palindromic)
##
## FALSE TRUE
## 1039 195
no_mismatches_no_palindromes=no_mismatches[which(no_mismatches$palindromic=="FALSE"),]
dim(no_mismatches_no_palindromes)
## [1] 1039 51
write.table(no_mismatches_no_palindromes, file = "fin_susie_coloc_sentinel_pter_bmi_multithreading_results/no_mismatches_no_palindromes.txt",
sep = "\t", quote = FALSE, row.names = FALSE)
We prepare the datasets required for the SuSiE and colocalization analyses.
# Extract the SNPs from no_mismatches_no_palindromes
snps_to_include2 <- no_mismatches_no_palindromes$SNP
# Subset ld_matrix1_filtered to include only the SNPs in snps_to_include
# Also ensure the order of the rows and columns follows the order in snps_to_include
ld_matrix1_filtered2 <- ld_matrix1_filtered[snps_to_include2, snps_to_include2]
# Subset ld_matrix2_filtered to include only the SNPs in snps_to_include
# Also ensure the order of the rows and columns follows the order in snps_to_include
ld_matrix2_filtered2 <- ld_matrix2_filtered[snps_to_include2, snps_to_include2]
# Verify the dimensions of the matrices to ensure they match the number of SNPs
dim(ld_matrix1_filtered2)
## [1] 1039 1039
dim(ld_matrix2_filtered2)
## [1] 1039 1039
# Dataset for Trait 1 (Exposure)
dataset1QC <- list(
beta = no_mismatches_no_palindromes$beta.exposure,
varbeta = (no_mismatches_no_palindromes$se.exposure)^2,
type = "quant",
snp = no_mismatches_no_palindromes$SNP,
position = no_mismatches_no_palindromes$pos.exposure,
N = no_mismatches_no_palindromes$samplesize.exposure[1],
MAF = no_mismatches_no_palindromes$maf.exposure,
LD = ld_matrix1_filtered2,
sdY = 1
)
# Dataset for Trait 2 (Outcome)
dataset2QC <- list(
beta = no_mismatches_no_palindromes$beta.outcome,
varbeta = (no_mismatches_no_palindromes$se.outcome)^2,
type = "quant",
snp = no_mismatches_no_palindromes$SNP,
position = no_mismatches_no_palindromes$pos.outcome,
N = no_mismatches_no_palindromes$samplesize.outcome[1],
MAF = no_mismatches_no_palindromes$maf.outcome,
LD = ld_matrix2_filtered2,
sdY = 1
)
check_alignment(dataset1QC)
## [1] 0.6468739
cat("Running SuSiE for Trait 1 (Exposure)...\n")
## Running SuSiE for Trait 1 (Exposure)...
susie_trait1QC <- susie_rss(
z = dataset1QC$beta / sqrt(dataset1QC$varbeta),
R = dataset1QC$LD,
n = dataset1QC$N,
var_y = dataset1QC$sdY^2,
L = 10,
verbose = FALSE
)
cat("SuSiE run for Trait 1 completed.\n")
## SuSiE run for Trait 1 completed.
summary(susie_trait1QC)
##
## Variables in credible sets:
##
## variable variable_prob cs
## 434 1.00000000 3
## 100 1.00000000 6
## 264 0.99992987 4
## 33 0.99864237 5
## 415 0.97212036 10
## 289 0.59193358 7
## 882 0.45489551 9
## 901 0.25180964 9
## 864 0.23183661 9
## 495 0.22192210 7
## 704 0.21833030 8
## 705 0.21833030 8
## 754 0.21833030 8
## 23 0.21162797 2
## 317 0.16140812 7
## 924 0.16104929 8
## 402 0.14207528 1
## 459 0.14207528 1
## 464 0.14207528 1
## 615 0.14207528 1
## 617 0.14207528 1
## 633 0.14207528 1
## 671 0.14207528 1
## 25 0.13079782 2
## 26 0.13079782 2
## 270 0.13079782 2
## 271 0.13079782 2
## 398 0.13079782 2
## 621 0.13079782 2
## 706 0.11891905 8
## 896 0.06503996 8
## 793 0.06145816 9
##
## Credible sets summary:
##
## cs cs_log10bf cs_avg_r2 cs_min_r2 variable
## 1 34.44827 1.0000000 1.0000000 402,459,464,615,617,633,671
## 2 48.62429 1.0000000 1.0000000 23,25,26,270,271,398,621
## 3 52.50596 1.0000000 1.0000000 434
## 4 16.97373 1.0000000 1.0000000 264
## 5 43.24439 1.0000000 1.0000000 33
## 6 20.99722 1.0000000 1.0000000 100
## 8 16.26745 1.0000000 1.0000000 704,705,706,754,896,924
## 10 4.53025 1.0000000 1.0000000 415
## 7 33.32946 0.9848154 0.9772667 289,317,495
## 9 10.99161 0.9762863 0.9528571 793,864,882,901
susie_trait1_runsusieQC <- runsusie(dataset1QC)
summary(susie_trait1_runsusieQC)
##
## Variables in credible sets:
##
## variable variable_prob cs
## 434 1.00000000 3
## 100 1.00000000 6
## 264 0.99992987 4
## 33 0.99864237 5
## 415 0.97212036 10
## 289 0.59193358 7
## 882 0.45489551 9
## 901 0.25180964 9
## 864 0.23183661 9
## 495 0.22192210 7
## 704 0.21833030 8
## 705 0.21833030 8
## 754 0.21833030 8
## 23 0.21162797 2
## 317 0.16140812 7
## 924 0.16104929 8
## 402 0.14207528 1
## 459 0.14207528 1
## 464 0.14207528 1
## 615 0.14207528 1
## 617 0.14207528 1
## 633 0.14207528 1
## 671 0.14207528 1
## 25 0.13079782 2
## 26 0.13079782 2
## 270 0.13079782 2
## 271 0.13079782 2
## 398 0.13079782 2
## 621 0.13079782 2
## 706 0.11891905 8
## 896 0.06503996 8
## 793 0.06145816 9
##
## Credible sets summary:
##
## cs cs_log10bf cs_avg_r2 cs_min_r2 variable
## 1 34.44827 1.0000000 1.0000000 402,459,464,615,617,633,671
## 2 48.62429 1.0000000 1.0000000 23,25,26,270,271,398,621
## 3 52.50596 1.0000000 1.0000000 434
## 4 16.97373 1.0000000 1.0000000 264
## 5 43.24439 1.0000000 1.0000000 33
## 6 20.99722 1.0000000 1.0000000 100
## 8 16.26745 1.0000000 1.0000000 704,705,706,754,896,924
## 10 4.53025 1.0000000 1.0000000 415
## 7 33.32946 0.9848154 0.9772667 289,317,495
## 9 10.99161 0.9762863 0.9528571 793,864,882,901
#str(susie_trait1QC)
# Extract the variables data frame from the SuSiE object
variables_df <- susie_trait1_runsusieQC$variables
if(is.null(variables_df)) {
# This path assumes 'variables' is within the 'sets' list, adjust if necessary
variables_df <- data.frame(
variable = unlist(lapply(susie_trait1_runsusieQC$sets$cs, function(x) x)),
cs = rep(names(susie_trait1_runsusieQC$sets$cs), times = lengths(susie_trait1_runsusieQC$sets$cs)),
variable_prob = susie_trait1_runsusieQC$pip[unlist(susie_trait1_runsusieQC$sets$cs)]
)
}
# Sum 'variable_prob' by 'cs'
sum_by_cs <- aggregate(variable_prob ~ cs, data = variables_df, FUN = sum)
# Print or further use 'sum_by_cs'
print(sum_by_cs)
## cs variable_prob
## 1 L1 0.9945270
## 2 L10 0.9721204
## 3 L2 0.9964149
## 4 L3 1.0000000
## 5 L4 0.9999299
## 6 L5 0.9986424
## 7 L6 1.0000000
## 8 L7 0.9752638
## 9 L8 0.9999992
## 10 L9 0.9999999
# Optionally, if you want to see which CS has the highest summed probability:
# print(sum_by_cs[which.max(sum_by_cs$variable_prob), ])
# Extract the variable strings for each credible set
susie_variable_strings_nat <- summary(susie_trait1QC)$cs$variable
susie_variable_strings_nat
## [1] "402,459,464,615,617,633,671" "23,25,26,270,271,398,621"
## [3] "434" "264"
## [5] "33" "100"
## [7] "704,705,706,754,896,924" "415"
## [9] "289,317,495" "793,864,882,901"
# Split the strings by commas and convert them into a numeric vector
susie_variable_indices_nat <- unlist(strsplit(susie_variable_strings_nat , ",")) %>% as.numeric()
susie_variable_indices_nat
## [1] 402 459 464 615 617 633 671 23 25 26 270 271 398 621 434 264 33 100 704
## [20] 705 706 754 896 924 415 289 317 495 793 864 882 901
# Extract the corresponding SNPs from the dataset using the indices
selected_snps_nat <- dataset1QC[["snp"]][susie_variable_indices_nat]
# Print the selected SNPs
print(selected_snps_nat)
## [1] "rs116982412" "rs17138607" "rs17319738" "rs7478403" "rs75016790"
## [6] "rs77547080" "rs80107016" "rs116849951" "rs12217763" "rs12217793"
## [11] "rs1052708" "rs1052709" "rs11599219" "rs75591548" "rs1334593"
## [16] "rs1023275" "rs45438202" "rs4747286" "rs10904798" "rs10904800"
## [21] "rs10904801" "rs11254170" "rs56077698" "rs6602152" "rs12253643"
## [26] "rs10795404" "rs10904760" "rs34743968" "rs12414052" "rs3886147"
## [31] "rs4748310" "rs56366628"
see_snps_in_data_nat=no_mismatches_no_palindromes[which(no_mismatches_no_palindromes$SNP%in%selected_snps_nat),]
#head(see_snps_in_data_nat)
table(see_snps_in_data_nat$Gene)
##
## C1QL3 CUBN PTER RSU1
## 4 1 17 10
# Subset the LD matrix to include only SNPs in all credible sets
ld_matrix_selected_snps_nat <- ld_matrix1_filtered2[selected_snps_nat, selected_snps_nat]
str(ld_matrix_selected_snps_nat)
## num [1:32, 1:32] 1 1 1 1 1 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:32] "rs116982412" "rs17138607" "rs17319738" "rs7478403" ...
## ..$ : chr [1:32] "rs116982412" "rs17138607" "rs17319738" "rs7478403" ...
# Visualize the correlation matrix using corrplot for selected SNPs
corrplot(ld_matrix_selected_snps_nat, method = "color", tl.cex = 0.6, tl.col = "black", is.corr = TRUE)
mtext("LD between SNPs in all credible sets for PTER-NAT", side = 3, line = 5, cex = 1.2)
check_alignment(dataset2QC)
## [1] 0.7886757
cat("Running SuSiE for Trait 2 (Exposure)...\n")
## Running SuSiE for Trait 2 (Exposure)...
susie_trait2QC <- susie_rss(
z = dataset2QC$beta / sqrt(dataset2QC$varbeta),
R = dataset2QC$LD,
n = dataset2QC$N,
var_y = dataset2QC$sdY^2,
L = 10,
verbose = FALSE
)
cat("SuSiE run for Trait 2 completed.\n")
## SuSiE run for Trait 2 completed.
summary(susie_trait2QC)
##
## Variables in credible sets:
##
## variable variable_prob cs
## 505 0.895633002 1
## 1013 0.441353438 2
## 961 0.296927609 2
## 829 0.076369709 2
## 512 0.053976420 1
## 695 0.048590964 2
## 286 0.033794794 1
## 881 0.031710212 2
## 828 0.019662102 2
## 694 0.013135876 2
## 1024 0.011692712 2
## 836 0.010036937 2
## 885 0.009677330 2
## 744 0.008171265 2
## 967 0.007566711 2
## 939 0.007397615 2
## 974 0.006877593 2
##
## Credible sets summary:
##
## cs cs_log10bf cs_avg_r2 cs_min_r2
## 1 2.931517 0.8552187 0.7919149
## 2 1.945980 0.6929520 0.5315649
## variable
## 286,505,512
## 694,695,744,828,829,836,881,885,939,961,967,974,1013,1024
susie_trait2_runsusieQC <- runsusie(dataset2QC)
summary(susie_trait2_runsusieQC)
##
## Variables in credible sets:
##
## variable variable_prob cs
## 505 0.895633002 1
## 1013 0.441353438 2
## 961 0.296927609 2
## 829 0.076369709 2
## 512 0.053976420 1
## 695 0.048590964 2
## 286 0.033794794 1
## 881 0.031710212 2
## 828 0.019662102 2
## 694 0.013135876 2
## 1024 0.011692712 2
## 836 0.010036937 2
## 885 0.009677330 2
## 744 0.008171265 2
## 967 0.007566711 2
## 939 0.007397615 2
## 974 0.006877593 2
##
## Credible sets summary:
##
## cs cs_log10bf cs_avg_r2 cs_min_r2
## 1 2.931517 0.8552187 0.7919149
## 2 1.945980 0.6929520 0.5315649
## variable
## 286,505,512
## 694,695,744,828,829,836,881,885,939,961,967,974,1013,1024
#str(susie_trait2QC)
# Extract the variables data frame from the SuSiE object
variables_df_bmi <- susie_trait2_runsusieQC$variables
if(is.null(variables_df_bmi)) {
# This path assumes 'variables' is within the 'sets' list, adjust if necessary
variables_df <- data.frame(
variable = unlist(lapply(susie_trait2_runsusieQC$sets$cs, function(x) x)),
cs = rep(names(susie_trait2_runsusieQC$sets$cs), times = lengths(susie_trait2_runsusieQC$sets$cs)),
variable_prob = susie_trait2_runsusieQC$pip[unlist(susie_trait2_runsusieQC$sets$cs)]
)
}
# Sum 'variable_prob' by 'cs'
sum_by_cs_bmi <- aggregate(variable_prob ~ cs, data = variables_df, FUN = sum)
# Print or further use 'sum_by_cs'
print(sum_by_cs_bmi)
## cs variable_prob
## 1 L1 0.9834042
## 2 L2 0.9891701
# Extract the variable strings for each credible set
susie_variable_strings_bmi <- summary(susie_trait2QC)$cs$variable
susie_variable_strings_bmi
## [1] "286,505,512"
## [2] "694,695,744,828,829,836,881,885,939,961,967,974,1013,1024"
# Split the strings by commas and convert them into a numeric vector
susie_variable_indices_bmi <- unlist(strsplit(susie_variable_strings_bmi , ",")) %>% as.numeric()
susie_variable_indices_bmi
## [1] 286 505 512 694 695 744 828 829 836 881 885 939 961 967 974
## [16] 1013 1024
# Extract the corresponding SNPs from the dataset using the indices
selected_snps_bmi <- dataset2QC[["snp"]][susie_variable_indices_bmi]
# Print the selected SNPs
print(selected_snps_bmi)
## [1] "rs10795400" "rs3898395" "rs45578234" "rs10795417" "rs10795424"
## [6] "rs11254144" "rs2253538" "rs2356378" "rs2883918" "rs4747283"
## [11] "rs4748314" "rs7069573" "rs7087428" "rs7089544" "rs7092558"
## [16] "rs7909091" "rs7916182"
see_snps_in_data_bmi=no_mismatches_no_palindromes[which(no_mismatches_no_palindromes$SNP%in%selected_snps_bmi),]
#head(see_snps_in_data_bmi)
table(see_snps_in_data_bmi$Gene)
##
## PTER RSU1
## 3 14
# Subset the LD matrix to include only SNPs in all credible sets
ld_matrix_selected_snps_bmi <- ld_matrix2_filtered2[selected_snps_bmi, selected_snps_bmi]
str(ld_matrix_selected_snps_bmi)
## num [1:17, 1:17] 1 0.9736 0.8899 0.0774 0.0364 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:17] "rs10795400" "rs3898395" "rs45578234" "rs10795417" ...
## ..$ : chr [1:17] "rs10795400" "rs3898395" "rs45578234" "rs10795417" ...
# Visualize the correlation matrix using corrplot for selected SNPs
corrplot(ld_matrix_selected_snps_bmi, method = "color", tl.cex = 0.6, tl.col = "black", is.corr = TRUE)
mtext("LD between SNPs in all credible sets for BMI", side = 3, line = 5, cex = 1.2)
cat("Running coloc.susie analysis...\n")
## Running coloc.susie analysis...
coloc_result_nat_bmi <- coloc.susie(susie_trait1QC, susie_trait2QC)
cat("coloc.susie analysis completed.\n")
## coloc.susie analysis completed.
coloc_result_nat_bmi$summary
coloc_summary <- coloc_result_nat_bmi$summary
print(coloc_summary)
## nsnps hit1 hit2 PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf
## <int> <char> <char> <num> <num> <num> <num>
## 1: 1039 rs116982412 rs3898395 3.790197e-36 0.01105482 3.363525e-34 0.9810192
## 2: 1039 rs116849951 rs3898395 2.546441e-50 0.01113887 2.259782e-48 0.9884935
## 3: 1039 rs1334593 rs3898395 3.327858e-54 0.01108509 2.953232e-52 0.9837105
## 4: 1039 rs1023275 rs3898395 1.139228e-18 0.01114174 1.010982e-16 0.9887480
## 5: 1039 rs45438202 rs3898395 6.108686e-45 0.01114184 5.421016e-43 0.9887576
## 6: 1039 rs4747286 rs3898395 1.078828e-22 0.01113754 9.573817e-21 0.9883746
## 7: 1039 rs10904798 rs3898395 5.792828e-18 0.01114176 5.140714e-16 0.9887501
## 8: 1039 rs12253643 rs3898395 3.161618e-06 0.01113716 2.805706e-04 0.9883420
## 9: 1039 rs10795404 rs3898395 5.021064e-35 0.01113959 4.455830e-33 0.9885568
## 10: 1039 rs4748310 rs3898395 1.093278e-12 0.01114175 9.702053e-11 0.9887495
## 11: 1039 rs116982412 rs7909091 3.365728e-35 0.09816777 3.087981e-34 0.9006651
## 12: 1039 rs116849951 rs7909091 2.239399e-49 0.09795782 2.054599e-48 0.8987345
## 13: 1039 rs1334593 rs7909091 2.947486e-53 0.09818067 2.704252e-52 0.9007837
## 14: 1039 rs1023275 rs7909091 1.003723e-17 0.09816480 9.208931e-17 0.9006377
## 15: 1039 rs45438202 rs7909091 5.381938e-44 0.09816301 4.937808e-43 0.9006213
## 16: 1039 rs4747286 rs7909091 9.446835e-22 0.09752663 8.667261e-21 0.8947697
## 17: 1039 rs10904798 rs7909091 5.103606e-17 0.09816129 4.682444e-16 0.9006055
## 18: 1039 rs12253643 rs7909091 2.784451e-05 0.09808552 2.554672e-04 0.8999093
## 19: 1039 rs10795404 rs7909091 4.400937e-34 0.09763789 4.037762e-33 0.8957927
## 20: 1039 rs4748310 rs7909091 9.631942e-12 0.09816047 8.837092e-11 0.9005979
## nsnps hit1 hit2 PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf
## PP.H4.abf idx1 idx2
## <num> <int> <int>
## 1: 0.0079260103 1 1
## 2: 0.0003675847 2 1
## 3: 0.0052044299 3 1
## 4: 0.0001102684 4 1
## 5: 0.0001005537 5 1
## 6: 0.0004878450 6 1
## 7: 0.0001081701 8 1
## 8: 0.0002370620 10 1
## 9: 0.0003036530 7 1
## 10: 0.0001087463 9 1
## 11: 0.0011671681 1 2
## 12: 0.0033076683 2 2
## 13: 0.0010356702 3 2
## 14: 0.0011974990 4 2
## 15: 0.0012156579 5 2
## 16: 0.0077036677 6 2
## 17: 0.0012332122 8 2
## 18: 0.0017218714 10 2
## 19: 0.0065693692 7 2
## 20: 0.0012416361 9 2
## PP.H4.abf idx1 idx2
# Save coloc.susie results
saveRDS(coloc_result_nat_bmi, file = "fin_susie_coloc_sentinel_pter_bmi_multithreading_results/coloc_result_nat_bmi.rds")
# Extract and save PP0 to PP4
pp_values <- as.data.frame(t(coloc_result_nat_bmi$summary))
colnames(pp_values) <- c("PP0", "PP1", "PP2", "PP3", "PP4")
write.table(pp_values, file = "fin_susie_coloc_sentinel_pter_bmi_multithreading_results/coloc_pp_values_coloc_result_nat_bmi.txt",
sep = "\t", quote = FALSE, row.names = FALSE)
# Display PP values
cat("PP0 to PP4 values:\n")
## PP0 to PP4 values:
print(pp_values)
## PP0 PP1 PP2 PP3 PP4
## nsnps 1039 1039 1039 1039 1039
## hit1 rs116982412 rs116849951 rs1334593 rs1023275 rs45438202
## hit2 rs3898395 rs3898395 rs3898395 rs3898395 rs3898395
## PP.H0.abf 3.790197e-36 2.546441e-50 3.327858e-54 1.139228e-18 6.108686e-45
## PP.H1.abf 0.01105482 0.01113887 0.01108509 0.01114174 0.01114184
## PP.H2.abf 3.363525e-34 2.259782e-48 2.953232e-52 1.010982e-16 5.421016e-43
## PP.H3.abf 0.9810192 0.9884935 0.9837105 0.9887480 0.9887576
## PP.H4.abf 0.0079260103 0.0003675847 0.0052044299 0.0001102684 0.0001005537
## idx1 1 2 3 4 5
## idx2 1 1 1 1 1
## NA NA NA NA NA
## nsnps 1039 1039 1039 1039 1039
## hit1 rs4747286 rs10904798 rs12253643 rs10795404 rs4748310
## hit2 rs3898395 rs3898395 rs3898395 rs3898395 rs3898395
## PP.H0.abf 1.078828e-22 5.792828e-18 3.161618e-06 5.021064e-35 1.093278e-12
## PP.H1.abf 0.01113754 0.01114176 0.01113716 0.01113959 0.01114175
## PP.H2.abf 9.573817e-21 5.140714e-16 2.805706e-04 4.455830e-33 9.702053e-11
## PP.H3.abf 0.9883746 0.9887501 0.9883420 0.9885568 0.9887495
## PP.H4.abf 0.0004878450 0.0001081701 0.0002370620 0.0003036530 0.0001087463
## idx1 6 8 10 7 9
## idx2 1 1 1 1 1
## NA NA NA NA NA
## nsnps 1039 1039 1039 1039 1039
## hit1 rs116982412 rs116849951 rs1334593 rs1023275 rs45438202
## hit2 rs7909091 rs7909091 rs7909091 rs7909091 rs7909091
## PP.H0.abf 3.365728e-35 2.239399e-49 2.947486e-53 1.003723e-17 5.381938e-44
## PP.H1.abf 0.09816777 0.09795782 0.09818067 0.09816480 0.09816301
## PP.H2.abf 3.087981e-34 2.054599e-48 2.704252e-52 9.208931e-17 4.937808e-43
## PP.H3.abf 0.9006651 0.8987345 0.9007837 0.9006377 0.9006213
## PP.H4.abf 0.0011671681 0.0033076683 0.0010356702 0.0011974990 0.0012156579
## idx1 1 2 3 4 5
## idx2 2 2 2 2 2
## NA NA NA NA NA
## nsnps 1039 1039 1039 1039 1039
## hit1 rs4747286 rs10904798 rs12253643 rs10795404 rs4748310
## hit2 rs7909091 rs7909091 rs7909091 rs7909091 rs7909091
## PP.H0.abf 9.446835e-22 5.103606e-17 2.784451e-05 4.400937e-34 9.631942e-12
## PP.H1.abf 0.09752663 0.09816129 0.09808552 0.09763789 0.09816047
## PP.H2.abf 8.667261e-21 4.682444e-16 2.554672e-04 4.037762e-33 8.837092e-11
## PP.H3.abf 0.8947697 0.9006055 0.8999093 0.8957927 0.9005979
## PP.H4.abf 0.0077036677 0.0012332122 0.0017218714 0.0065693692 0.0012416361
## idx1 6 8 10 7 9
## idx2 2 2 2 2 2
# used "summary_stats_filtered" originally
# Define genome-wide significance threshold
genomewide_sig_threshold <- -log10(5e-8)
# Prepare data for Manhattan plot
no_mismatches_no_palindromes$log_pval <- -log10(no_mismatches_no_palindromes$pval.exposure)
# Convert to data frame if necessary
if (is.data.table(no_mismatches_no_palindromes)) {
no_mismatches_no_palindromes_df <- as.data.frame(no_mismatches_no_palindromes)
} else {
no_mismatches_no_palindromes_df <- no_mismatches_no_palindromes
}
# Prepare data for labeling
snps_to_label <- unique(c(selected_snps_nat)) #, selected_snps_bmi
snps_label_data <- no_mismatches_no_palindromes_df %>%
dplyr::filter(SNP %in% snps_to_label) %>%
dplyr::select(SNP, pos.exposure, log_pval, Gene)
# Create Manhattan Plot
manhattan_plot <- ggplot(no_mismatches_no_palindromes_df, aes(x = pos.exposure, y = log_pval)) +
geom_point(aes(color = as.factor(chr.exposure)), alpha = 0.75, size = 1.3) +
scale_color_manual(values = rep(c("blue", "orange"), length(unique(no_mismatches_no_palindromes_df$chr.exposure)))) +
labs(
x = "Genomic Position (Chromosome 10)",
y = expression(-log[10](italic(p)-value)),
title = "Manhattan Plot with SuSiE Credible Set SNPs Labeled"
) +
geom_hline(
yintercept = genomewide_sig_threshold,
linetype = "dotted",
color = "red",
linewidth = 1
) +
theme_minimal() +
theme(
legend.position = "none",
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1, size = 8)
) +
geom_text_repel(
data = snps_label_data,
aes(label = paste(SNP, Gene, sep = " - ")),
nudge_y = 1,
size = 3,
color = "black",
max.overlaps = Inf,
box.padding = 0.5,
point.padding = 0.5,
segment.color = "grey50"
)
# Display the plot
print(manhattan_plot)
# Save the plot
output_dir <- "fin_susie_coloc_sentinel_pter_bmi_multithreading_results/figures"
if (!dir.exists(output_dir)) {
dir.create(output_dir, recursive = TRUE)
}
ggsave(
filename = file.path(output_dir, "manhattan_plot_susie_snps_no_mismatches_no_palindromes_df.pdf"),
plot = manhattan_plot,
width = 10,
height = 6,
dpi = 300
)
In this pleliminary analysis, we performed fine-mapping using SuSiE for both NAT and BMI traits, identified credible sets, and conducted colocalization analysis using coloc.susie. The Manhattan plot visualizes SNPs within the region of interest and highlights those within the credible sets for NAT. The NAT data contained four credible sets containing 32 SNPs from four nearest genes: C1QL3, CUBN, PTER, and RSU1. The BMI data contained two credible sets with SNPs with PTER and RSU1 as the nearest genes. There wasn’t a highly significant hit for BMI in the studied region, though coloc suggested evidence of separate causal SNPs for NAT and BMI, supposing non-genome-wide hits for BMI are causal.
Notably, this analysis improves upon its legacy version by identifying and removing palindromic SNPs and ensuring that the directionality of all included SNPs are aligned between the four data sets: NAT, BMI, and their respective reference-panel LD matrices. The next time we do this, maybe we should harmonize the NAT and BMI data without the harmonize_data() package or consider including a way to retain multi-allelic SNPs (i.e., those that have multiple minor alleles circulating at a reasonable frequency in the population).
# Save the entire R environment
save.image(file = "fin_susie_coloc_sentinel_pter_bmi_multithreading_results/full_environment.RData")
cat("Entire R environment saved.\n")
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.0.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] biomaRt_2.61.3 readxl_1.4.3 TwoSampleMR_0.6.8
## [4] corrplot_0.94 tidyr_1.3.1 ggrepel_0.9.6
## [7] data.table_1.16.2 dplyr_1.1.4 openxlsx_4.2.7.1
## [10] RhpcBLASctl_0.23-42 ggplot2_3.5.1 susieR_0.12.35
## [13] coloc_5.2.3
##
## loaded via a namespace (and not attached):
## [1] DBI_1.2.3 gridExtra_2.3 httr2_1.0.5
## [4] rlang_1.1.4 magrittr_2.0.3 matrixStats_1.4.1
## [7] compiler_4.4.1 RSQLite_2.3.7 systemfonts_1.1.0
## [10] png_0.1-8 vctrs_0.6.5 RcppZiggurat_0.1.6
## [13] stringr_1.5.1 pkgconfig_2.0.3 crayon_1.5.3
## [16] fastmap_1.2.0 dbplyr_2.5.0 XVector_0.45.0
## [19] labeling_0.4.3 utf8_1.2.4 rmarkdown_2.28
## [22] UCSC.utils_1.1.0 ragg_1.3.3 purrr_1.0.2
## [25] bit_4.5.0 xfun_0.47 Rfast_2.1.0
## [28] zlibbioc_1.51.1 cachem_1.1.0 GenomeInfoDb_1.41.1
## [31] jsonlite_1.8.9 progress_1.2.3 blob_1.2.4
## [34] highr_0.11 reshape_0.8.9 irlba_2.3.5.1
## [37] parallel_4.4.1 prettyunits_1.2.0 R6_2.5.1
## [40] bslib_0.8.0 stringi_1.8.4 jquerylib_0.1.4
## [43] cellranger_1.1.0 Rcpp_1.0.13 knitr_1.48
## [46] IRanges_2.39.2 Matrix_1.7-0 tidyselect_1.2.1
## [49] rstudioapi_0.16.0 yaml_2.3.10 viridis_0.6.5
## [52] curl_5.2.3 lattice_0.22-6 tibble_3.2.1
## [55] plyr_1.8.9 Biobase_2.65.1 withr_3.0.1
## [58] KEGGREST_1.45.1 evaluate_1.0.0 RcppParallel_5.1.9
## [61] BiocFileCache_2.13.0 zip_2.3.1 xml2_1.3.6
## [64] Biostrings_2.73.2 pillar_1.9.0 filelock_1.0.3
## [67] stats4_4.4.1 generics_0.1.3 S4Vectors_0.43.2
## [70] hms_1.1.3 munsell_0.5.1 scales_1.3.0
## [73] glue_1.8.0 tools_4.4.1 grid_4.4.1
## [76] AnnotationDbi_1.67.0 colorspace_2.1-1 GenomeInfoDbData_1.2.12
## [79] cli_3.6.3 rappdirs_0.3.3 textshaping_0.4.0
## [82] fansi_1.0.6 mixsqp_0.3-54 viridisLite_0.4.2
## [85] gtable_0.3.5 sass_0.4.9 digest_0.6.37
## [88] BiocGenerics_0.51.1 farver_2.1.2 memoise_2.0.1
## [91] htmltools_0.5.8.1 lifecycle_1.0.4 httr_1.4.7
## [94] bit64_4.5.2