GenomicSEM for 40 Chemokines

1 Summary
2 Data
- 2.1 UK Biobank Pharma Proteomics Project (UKB-PPP) Proteins
3 Bioinformatic preprocessing
- 3.1 Fetching rsID and calculating p-values
4 GenomicSEM preparation
5 Determine number of factors
6 Six-factor multivariate GWAS

1 Summary

The analysis uses Genomic Structural Equation Modeling (GenomicSEM) to run a multivariate GWAS of 40 chemokines. The value of GenomicSEM lies in its ability to enhance statistical power and uncover pleiotropic effects that may remain undetected in pairwise or univariate analyses.

2 Data

2.1 UK Biobank Pharma Proteomics Project (UKB-PPP) Proteins

40 chemokine summary statistics; Europeans; both GRCh19/38
I had previously obtained these (see https://rpubs.com/YodaMendel/1243451 for an example of how to programmatically get files from UKB-PPP.

3 Bioinformatic preprocessing

3.1 Fetching rsID and calculating p-values

script: /Users/charleenadams/ukbppp/process_40_chemokines.sh

#!/bin/bash

# Directories and variables
TAR_DIR="/Users/charleenadams/ukbppp/chemokine_list"
MAP_DIR="/Users/charleenadams/temp_BI/chemokine_rgs_olink/maps"
OUTPUT_BASE_DIR="/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list"
MERGED_DIR="$OUTPUT_BASE_DIR/merged_files"
LOG_FILE="$OUTPUT_BASE_DIR/processing_log.txt"

# Create required directories
mkdir -p "$OUTPUT_BASE_DIR"
mkdir -p "$MERGED_DIR"
echo "Processing started on $(date)" > "$LOG_FILE"

# Function to process each tar file
process_tar_file() {
    local tar_file="$1"
    local base_name=$(basename "$tar_file" .tar)
    local untarred_dir="$OUTPUT_BASE_DIR/$base_name"
    local output_dir="$OUTPUT_BASE_DIR/$base_name"

    echo "Processing tar file: $tar_file" | tee -a "$LOG_FILE"

    # Untar the file into the correct directory
    mkdir -p "$untarred_dir"
    tar -xf "$tar_file" -C "$untarred_dir" || {
        echo "Failed to untar: $tar_file" | tee -a "$LOG_FILE"
        return
    }

    # Process chromosome files within the untarred directory
    mkdir -p "$output_dir"
    local processed_files=()
    find "$untarred_dir" -type f -name "*.gz" | while read -r chr_file; do
        # Skip files that are already processed
        if [[ "$chr_file" == *"_rsid_added.gz" || "$chr_file" == *"_with_pval.gz" ]]; then
            echo "Skipping already processed file: $chr_file" | tee -a "$LOG_FILE"
            continue
        fi

        local chrom=$(basename "$chr_file" | sed -E 's/.*chr([0-9XY]+).*/\1/')
        local map_file="$MAP_DIR/olink_rsid_map_mac5_info03_b0_7_chr${chrom}_patched_v2.tsv.gz"
        local output_file="${chr_file%.gz}_rsid_added.gz"

        if [[ -n "$chrom" && -f "$map_file" ]]; then
            echo "Adding RSID to $chr_file (Chromosome $chrom)" | tee -a "$LOG_FILE"
            gunzip -c "$chr_file" | \
            awk 'BEGIN {OFS="\t"}
                 NR==FNR {map[$1]=$4; genpos37[$1]=$5; genpos38[$1]=$6; next}
                 ($3 in map) {print $0, map[$3], genpos37[$3], genpos38[$3]}
                 !($3 in map) {print $0, ".", ".", "."}' <(gunzip -c "$map_file") - | \
            gzip > "$output_file"

            if [[ -s "$output_file" ]]; then
                echo "RSID added to $output_file" | tee -a "$LOG_FILE"
                processed_files+=("$output_file")

                # Add P-values if not already added
                local pval_file="${output_file%.gz}_with_pval.gz"
                if [[ ! -f "$pval_file" ]]; then
                    gunzip -c "$output_file" | \
                    awk 'BEGIN {OFS="\t"} NR==1 {print $0, "P"} NR>1 {pval = (10 ^ -$13); print $0, pval}' | \
                    gzip > "$pval_file"
                    mv "$pval_file" "$output_dir/$(basename "$pval_file")"
                else
                    echo "P-values already added for $output_file" | tee -a "$LOG_FILE"
                fi
            else
                echo "Warning: RSID addition failed for $chr_file" | tee -a "$LOG_FILE"
            fi
        else
            echo "RSID map file for chromosome $chrom not found." | tee -a "$LOG_FILE"
        fi
    done

    # Merge chromosome files in the output directory
    if [[ ${#processed_files[@]} -gt 0 ]]; then
        echo "Merging chromosome files in $output_dir" | tee -a "$LOG_FILE"
        local merged_file="$MERGED_DIR/${base_name}_merged_rsid_added.gz"

        # Extract header from the first file
        gunzip -c "${processed_files[0]}" | head -n 1 > temp_header.txt

        # Process each file, excluding the header, and append to a temporary content file
        for file in "${processed_files[@]}"; do
            gunzip -c "$file" | tail -n +2 >> temp_content.txt
        done

        # Concatenate header and content into the final merged file
        cat temp_header.txt temp_content.txt | gzip -c > "$merged_file"

        # Clean up temporary files
        rm -f temp_header.txt temp_content.txt

        if [[ -s "$merged_file" ]]; then
            echo "Merged file created: $merged_file" | tee -a "$LOG_FILE"
        else
            echo "Warning: Merged file is empty for $output_dir" | tee -a "$LOG_FILE"
        fi
    else
        echo "No files to merge in $output_dir" | tee -a "$LOG_FILE"
    fi
}

# Export the function and variables for parallel processing
export -f process_tar_file
export MAP_DIR OUTPUT_BASE_DIR LOG_FILE MERGED_DIR

# Find all tar files and process them in parallel
find "$TAR_DIR" -type f -name "*.tar" | parallel -j15 process_tar_file

echo "Processing completed on $(date)" >> "$LOG_FILE"
echo "All results saved in $OUTPUT_BASE_DIR and merged files in $MERGED_DIR"

# tmux new -s my_session # didn't use tmux; keeping command to trigger memory
# nohup ./process_40_chemokines.sh > process_40_chemokines_log 2>&1 &
# pstree 91532
# ps -p 91532 -o stat,etime
# watch -n 1 ls -lt /Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list
# find /Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list -type f -name "*with_pval.gz" | wc -l

# remove the extra files
main_dir="/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list"

clean_directory() {
    local dir="$1"
    find "$dir" -type f -not -name "*_rsid_added_with_pval.gz" -not -name "*.txt" -delete
    echo "Processed: $dir"
}

find "$main_dir" -type d | while read -r dir; do
    clean_directory "$dir"
done

4 GenomicSEM preparation

4.1 Munge by chromsome

library(GenomicSEM)
library(parallel)

# Set directories
base_dir <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list"
hm3_snplist <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/w_hm3.snplist"
output_directory <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40"

# Ensure the output directory exists
if (!dir.exists(output_directory)) {
  dir.create(output_directory, recursive = TRUE)
}

# Define column names as a named list
column_names <- list(
  SNP = "rsid",        # The column for SNP identifiers
  A1 = "ALLELE1",      # The column for the first allele
  A2 = "ALLELE0",      # The column for the second allele
  effect = "BETA",     # The column for the effect size
  INFO = "INFO",       # The column for the INFO score
  P = "P",             # The column for the P-value
  N = "N",             # The column for the sample size
  MAF = "A1FREQ",      # The column for the minor allele frequency
  Z = "CHISQ"          # The column for the Z-score or chi-squared value
)

# Function to extract `N` from the first row of a file
extract_N <- function(file, column) {
  header <- read.table(gzfile(file), nrows = 1, header = TRUE)
  if (column %in% colnames(header)) {
    N <- unique(header[[column]])
    if (length(N) == 1) {
      return(as.numeric(N))
    } else {
      stop(paste("N column has multiple values in file:", file))
    }
  } else {
    stop(paste("Column", column, "not found in file:", file))
  }
}

# Function to process a single subdirectory
process_subdirectory <- function(subdir) {
  tryCatch({
    # Get subdirectory name
    subdir_name <- basename(subdir)
    
    # Create output subdirectory
    subdir_output <- file.path(output_directory, subdir_name)
    if (!dir.exists(subdir_output)) {
      dir.create(subdir_output, recursive = TRUE)
    }
    
    # Get the list of chromosome-specific files for chromosomes 1–22
    chr_files <- list.files(subdir, full.names = TRUE, pattern = "chr[1-9]|chr1[0-9]|chr2[0-2]_.*rsid_added_with_pval\\.gz$")
    
    if (length(chr_files) == 0) {
      cat(paste("No files found in subdirectory:", subdir, "\n"))
      return(NULL)
    }
    
    # Extract `N` for each file
    N_values <- sapply(chr_files, extract_N, column = column_names$N)
    
    # Define trait names based on file names
    trait_names <- gsub("_rsid_added_with_pval\\.gz$", "", basename(chr_files))
    
    # Run the munge function with internal parallelization
    munge(
      files = chr_files,
      hm3 = hm3_snplist,
      trait.names = trait_names,
      N = N_values,
      info.filter = 0.9,       # Filter by INFO score
      maf.filter = 0.01,       # Filter by MAF
      column.names = column_names,
      parallel = TRUE,         # Enable internal parallelization
      cores = detectCores() - 1,  # Use all available cores minus one
      overwrite = TRUE         # Allow overwriting of existing files
    )
    
    cat(paste("Processing completed for subdirectory:", subdir, "\n"))
  }, error = function(e) {
    cat(paste("Error processing subdirectory:", subdir, "\nError message:", e$message, "\n"))
  })
}

# Get all subdirectories
subdirs <- list.dirs(base_dir, full.names = TRUE, recursive = FALSE)

# Sequentially process subdirectories
for (subdir in subdirs) {
  process_subdirectory(subdir)
}

4.2 Move munged files to munged_40

ls -lt /Users/charleenadams/ukbppp/*.sumstats.gz | grep "Dec  8" | wc -l
# Without notes:
directory="/Users/charleenadams/ukbppp"
target_date="Dec  8"
output_dir="/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40"
mkdir -p "$output_dir"

while IFS= read -r file; do
  cp "$file" "$output_dir"
done < <(ls -lt "$directory"/*.sumstats.gz | grep "$target_date" | awk '{print $NF}')
ls -lt /Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/*.sumstats.gz | grep "Dec  8" | wc -l

4.3 Merge munged files for each protein

# Run with:
cd /Users/charleenadams/ukbppp
nohup Rscript merged_munged_40.R > merged_munged_40.log 2>&1 &

library(parallel)

# Define directories
base_dir <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40"
results_dir <- file.path(base_dir, "merged_munged_protein_results")

# Ensure the results directory exists
if (!dir.exists(results_dir)) {
  dir.create(results_dir, recursive = TRUE)
}

# Function to merge chromosome files for a protein
merge_chromosome_files <- function(protein_name, chr_files, output_dir) {
  tryCatch({
    # Define output file path
    merged_file <- file.path(output_dir, paste0(protein_name, "_merged.sumstats.gz"))
    
    # Merge chromosome files
    all_data <- do.call(rbind, lapply(chr_files, function(f) {
      read.table(f, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
    }))
    
    # Save the merged file
    write.table(
      all_data,
      file = gzfile(merged_file),
      quote = FALSE,
      sep = "\t",
      row.names = FALSE,
      col.names = TRUE
    )
    cat(paste("Merged protein:", protein_name, "to", merged_file, "\n"))
    return(merged_file)
  }, error = function(e) {
    cat(paste("Error merging protein:", protein_name, "\nError message:", e$message, "\n"))
    return(NULL)
  })
}

# Get list of all chromosome files
chr_files <- list.files(base_dir, pattern = "discovery_chr.*\\.sumstats\\.gz$", full.names = TRUE)

# Extract unique protein identifiers
proteins <- unique(gsub("discovery_chr[0-9]+_", "", gsub("\\.sumstats\\.gz$", "", basename(chr_files))))

# Function to process a single protein
process_protein <- function(protein) {
  protein_files <- grep(protein, chr_files, value = TRUE)
  merge_chromosome_files(protein, protein_files, output_dir = results_dir)
}

# Run merging in parallel for 12 proteins
num_cores <- detectCores() - 1  # Use all but one core
cl <- makeCluster(num_cores)
clusterExport(cl, c("chr_files", "proteins", "merge_chromosome_files", "results_dir"))
clusterEvalQ(cl, library(parallel))
merged_files <- parLapply(cl, proteins, process_protein)
stopCluster(cl)

# Print summary
cat("Merging completed for all proteins.\n")

# nohup Rscript merged_munged_40.R > merged_munged_40.log 2>&1 &

4.4 Add p-value back for generating “sumstats”

Run with: 
nohup Rscript add_p_sumstats.R > add_p_sumstats.log 2>&1 &

library(data.table)

# Define the function to add a P column
add_p_column <- function(file, output_dir) {
  tryCatch({
    data <- fread(file)
    if (!"P" %in% colnames(data)) {
      if ("Z" %in% colnames(data)) {
        # Calculate P-values from Z-scores
        data[, P := 2 * pnorm(-abs(Z))]
        
        # Create output file path
        output_file <- file.path(output_dir, basename(file))
        
        # Save the updated data to the new file
        fwrite(data, output_file, sep = "\t")
        cat("Added P column to:", output_file, "\n")
      } else {
        stop("Z column missing. Cannot calculate P-values in file:", file)
      }
    } else {
      cat("P column already exists in:", file, "\n")
    }
  }, error = function(e) {
    cat("Error processing file:", file, "\nError message:", e$message, "\n")
  })
}

# Set input and output directories
input_dir <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results"
output_dir <- file.path(input_dir, "pvals_fix")

# Create the output directory if it doesn't exist
if (!dir.exists(output_dir)) {
  dir.create(output_dir)
}

# List all files matching the pattern in the input directory
files <- list.files(input_dir, pattern = "\\.sumstats\\.gz$", full.names = TRUE)

# Apply the function to all files
lapply(files, add_p_column, output_dir = output_dir)

4.5 Generate “sumstats” for 40 chemokines

# Run interactively in Rstudio

library(data.table)
library(GenomicSEM)

# Step 1: Define paths and variables
input_dir <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix"  # Input directory containing sumstats files
output_path <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/summary_stats_all_40_chemokines.txt"  # Output file for all processed summary stats
ref <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/reference.1000G.maf.0.005.txt"  # Reference file for SNP variance

# Step 2: Get a list of all .sumstats.gz files in the input directory
trait_files <- list.files(input_dir, pattern = "\\.sumstats\\.gz$", full.names = TRUE)

# Check if any files were found
if (length(trait_files) == 0) {
  stop("No .sumstats.gz files found in the input directory.")
}

# Extract trait names from filenames (without .sumstats.gz extension)
trait.names <- gsub("\\.sumstats\\.gz$", "", basename(trait_files))

cat("Processing traits:\n", paste(trait.names, collapse = "\n"), "\n")

# Step 3: Extract sample sizes (N) from each trait file
get_sample_size <- function(file) {
  data <- fread(file, select = "N", nrows = 1)  # Only read the first row to extract N
  return(data$N[1])
}

N <- sapply(trait_files, get_sample_size)
cat("Extracted sample sizes:\n", paste(N, collapse = "\n"), "\n")

# Step 4: Prepare the summary statistics with sumstats()
# Define arguments for sumstats()
se.logit <- rep(FALSE, length(trait_files))  # Continuous traits; SEs are not logistic
OLS <- rep(TRUE, length(trait_files))       # Ordinary least squares for continuous traits
linprob <- rep(FALSE, length(trait_files))  # Traits are not dichotomous; linear probability does not apply

# Run sumstats() to align and scale summary statistics
aligned_sumstats <- sumstats(
  files = trait_files,
  ref = ref,
  trait.names = trait.names,
  se.logit = se.logit,
  OLS = OLS,
  linprob = linprob,
  N = N,
  betas = NULL,
  #maf.filter = 0.01,
  keep.indel = FALSE,
  parallel = TRUE,
  cores = 16  # Adjust for available cores
)

# Save the aligned summary statistics
fwrite(aligned_sumstats, file = output_path, sep = "\t")
cat("Aligned summary statistics saved to:", output_path, "\n")

# Optional: Verify the saved file by re-loading it
summary_stats_read <- fread(output_path)
print(head(summary_stats_read))

4.6 Multivariate LDSC on 40 chemokines

# To run:
cd /Users/charleenadams/ukbppp
nohup Rscript ldsc40.R > ldsc40.log 2>&1 &

# Load required library
library(GenomicSEM)
library(data.table)
library(parallel)
library(qqman)

# Define paths and variables
traits <- list.files("/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix", 
                     pattern = "\\.sumstats\\.gz$", full.names = TRUE)

# Set sample and population prevalences to NA to bypass liability scaling
sample.prev <- rep(NA, length(traits))  # Treat traits as continuous
population.prev <- rep(NA, length(traits))

ld <- "/Users/charleenadams/temp_BI/mvGWAS_chemokines/eur_w_ld_chr/"
wld <- "/Users/charleenadams/temp_BI/mvGWAS_chemokines/eur_w_ld_chr/"

# Optional: Name your traits based on filenames without extensions
trait.names <- gsub("\\.sumstats\\.gz$", "", basename(traits))
head(trait.names)

# Redirect console output to a log file
sink("/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/ldsc_40_rgs.log")

# Run multivariable LDSC
LDSCoutput <- ldsc(traits = traits,
                   sample.prev = sample.prev,
                   population.prev = population.prev,
                   ld = ld,
                   wld = wld,
                   trait.names = trait.names,
                   stand = TRUE)

# Stop capturing the console output
sink()

# Save the results
output_path <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/ldsc_results_rgs"
saveRDS(LDSCoutput, file = file.path(output_path, "LDSCoutput40.rds"))

# Print completion message
cat("Multivariable LDSC analysis complete and save in:\n", output_path, "\n")

5 Determine number of factors

Uses parallel analysis (paLDSC) to determine number of factors to extract from a genetic correlation matrix: https://rpubs.com/JaFuente/paLDSC

library(GenomicSEM)
library(knitr)
library(dplyr)
library(kableExtra)

LDSCoutputfull <- readRDS("/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/ldsc_results_rgs/LDSCoutput40.rds")
#2. Genetic correlation matrix from LDSC output
S_Stand <- LDSCoutputfull$S_Stand
#3. Associated standardized multivariate sampling distribution matrix from LDSC output
V_Stand <- LDSCoutputfull$V_Stand

# Save the plot as a 600 DPI PNG
output_path <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/ldsc_results_rgs/paLDSC_plot_40.png"
png(filename = output_path, width = 8, height = 6, units = "in", res = 600)

# Run the `paLDSC` function
paLDSC(S = S_Stand, V = V_Stand, r = 10)

## Running parallel analysis. Replication number:  1 
## Running parallel analysis. Replication number:  2 
## Running parallel analysis. Replication number:  3 
## Running parallel analysis. Replication number:  4 
## Running parallel analysis. Replication number:  5 
## Running parallel analysis. Replication number:  6 
## Running parallel analysis. Replication number:  7 
## Running parallel analysis. Replication number:  8 
## Running parallel analysis. Replication number:  9 
## Running parallel analysis. Replication number:  10

## ------------------------------------------------------------------------ 
## Parallel Analysis suggests extracting 6 components 
## ------------------------------------------------------------------------

# Close the graphics device
dev.off()

## quartz_off_screen 
##                 2

# Perform factor analysis with 6 components
fa_results <- psych::fa(S_Stand, nfactors = 6, rotate = "varimax")

# Print loadings
#print(fa_results$loadings)

# Assuming fa_results$loadings is your factor loading matrix
loadings_df <- as.data.frame(unclass(fa_results$loadings))

# Process the chemokine names
loadings_df$Protein <- rownames(loadings_df)
loadings_df$Protein <- sapply(strsplit(loadings_df$Protein, ":"), function(x) paste(x[1:1], collapse = ":"))

# Order by highest loading on first factor for clarity
ordered_loadings <- loadings_df[order(-abs(loadings_df$MR1)), ]

# Select top loadings for each factor
top_loadings <- ordered_loadings %>%
  mutate(across(MR1:MR5, ~ifelse(abs(.) > 0.3, ., NA))) %>%
  dplyr::select(Protein, everything())

# Remove row names for the final output
rownames(top_loadings) <- NULL

# Display in a nice table (you might want to adjust the threshold or how many to show)
kable(top_loadings, "html", digits = 3, 
      caption = "Top Factor Loadings for Chemokines",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE, position = "center")

Top Factor Loadings for Chemokines
Protein	MR1	MR2	MR6	MR3	MR4	MR5
CCL5	0.830	NA	NA	NA	NA	NA
CXCL3	0.779	NA	0.406	NA	NA	NA
CCL17	0.768	NA	NA	NA	NA	NA
CCL8	0.704	NA	NA	NA	NA	NA
CCL26	0.701	NA	NA	NA	NA	NA
CCL13	0.690	NA	NA	NA	NA	0.353
CXCL5	0.688	NA	0.370	NA	NA	NA
CXCL1	0.665	NA	0.442	NA	NA	NA
CCL7	0.637	0.567	0.355	NA	NA	NA
CXCL6	0.615	NA	0.452	NA	NA	NA
CCL14	0.568	NA	NA	0.469	NA	0.337
CXCL11	0.559	0.353	NA	NA	0.364	NA
CXCL12	0.503	NA	NA	NA	NA	NA
CCL28	0.478	NA	NA	NA	NA	NA
CCL22	0.475	0.552	NA	NA	NA	NA
CCL11	0.460	NA	0.329	NA	NA	0.342
CCL27	0.417	NA	NA	0.463	NA	NA
CCL4	0.405	0.468	NA	NA	NA	NA
CCL2	0.385	NA	0.309	NA	NA	0.605
CXCL8	0.368	NA	0.905	NA	NA	NA
CXCL8	0.364	NA	0.906	NA	NA	NA
CXCL8	0.358	NA	0.907	NA	NA	NA
CXCL8	0.345	NA	0.919	NA	NA	NA
CXCL13	0.327	0.637	NA	NA	0.335	-0.370
CCL3	0.310	0.597	NA	0.362	NA	NA
CCL24	NA	0.322	NA	NA	NA	NA
CCL18	NA	0.686	NA	NA	NA	NA
CCL19	NA	0.816	NA	NA	NA	NA
CCL23	NA	0.359	NA	0.653	NA	NA
CXCL10	NA	0.518	NA	NA	0.541	NA
CCL20	NA	0.717	NA	NA	NA	NA
CXCL17	NA	0.399	NA	NA	NA	NA
CCL21	NA	0.672	NA	NA	NA	NA
CXCL16	NA	NA	NA	NA	NA	0.367
CXCL9	NA	0.307	NA	NA	0.715	NA
CCL16	NA	0.534	NA	0.597	NA	NA
CCL25	NA	NA	NA	0.578	0.395	NA
CCL15	NA	NA	NA	0.730	NA	NA
CXCL14	NA	0.484	NA	NA	-0.535	NA
TAFA5	NA	NA	NA	0.582	NA	0.306

# Group them
# Load necessary libraries
library(psych)

# Assuming LDSCoutputfull is loaded
# If not, load it with:
# LDSCoutputfull <- readRDS("/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/ldsc_results_rgs/LDSCoutput40.rds")

# Extract standardized genetic correlation matrix
S_Stand <- LDSCoutputfull$S_Stand

# Perform factor analysis with 6 components (as suggested by parallel analysis)
fa_results <- fa(S_Stand, nfactors = 6, rotate = "varimax")

# Extract loadings
loadings <- fa_results$loadings

# Function to assign chemokines to factors based on highest loading
assign_to_factor <- function(loadings_matrix, threshold = 0.3) {
  chemokines <- rownames(loadings_matrix)
  factors <- colnames(loadings_matrix)
  assignments <- list()
  
  for(chemokine in chemokines) {
    load <- abs(loadings_matrix[chemokine, ])
    max_load <- max(load)
    if(max_load >= threshold) {
      max_factor <- factors[which.max(load)]
      if(is.null(assignments[[max_factor]])) {
        assignments[[max_factor]] <- c()
      }
      assignments[[max_factor]] <- c(assignments[[max_factor]], chemokine)
    }
  }
  return(assignments)
}

# Assign chemokines to factors with a loading threshold
chemokine_groupings <- assign_to_factor(loadings, threshold = 0.3)

# Print the groupings
#print(chemokine_groupings)

# Assuming 'chemokine_groupings' is the list with chemokines grouped by factors
chemokine_groupings <- list(
  MR1 = c("CCL11:P51671:OID20668:v1:Inflammation_merged", "CCL13:Q99616:OID20655:v1:Inflammation_merged", 
          "CCL14:Q16627:OID20401:v1:Cardiometabolic_merged", "CCL17:Q92583:OID20745:v1:Inflammation_merged", 
          "CCL26:Q9Y258:OID20546:v1:Inflammation_merged", "CCL28:Q9NRJ3:OID20569:v1:Inflammation_merged", 
          "CCL5:P13501:OID20412:v1:Cardiometabolic_merged", "CCL7:P80098:OID20523:v1:Inflammation_merged", 
          "CCL8:P80075:OID21466:v1:Oncology_merged", "CXCL1:P09341:OID20762:v1:Inflammation_merged", 
          "CXCL11:O14625:OID21042:v1:Neurology_merged", "CXCL12:P48061:OID20464:v1:Inflammation_merged", 
          "CXCL3:P19876:OID20788:v1:Inflammation_merged", "CXCL5:P42830:OID20207:v1:Cardiometabolic_merged", 
          "CXCL6:P80162:OID20613:v1:Inflammation_merged"),
  MR3 = c("CCL15:Q16663:OID20328:v1:Cardiometabolic_merged", "CCL16:O15467:OID20334:v1:Cardiometabolic_merged", 
          "CCL23:P55773:OID20693:v1:Inflammation_merged", "CCL25:O15444:OID20674:v1:Inflammation_merged", 
          "CCL27:Q9Y4X3:OID20121:v1:Cardiometabolic_merged", "TAFA5:Q7Z5A7:OID21321:v1:Oncology_merged"),
  MR2 = c("CCL18:P55774:OID20395:v1:Cardiometabolic_merged", "CCL19:Q99731:OID21030:v1:Neurology_merged", 
          "CCL20:P78556:OID20671:v1:Inflammation_merged", "CCL21:O00585:OID20686:v1:Inflammation_merged", 
          "CCL22:O00626:OID20765:v1:Inflammation_merged", "CCL24:O00175:OID20772:v1:Inflammation_merged", 
          "CCL3:P10147:OID20610:v1:Inflammation_merged", "CCL4:P13236:OID20695:v1:Inflammation_merged", 
          "CXCL13:O43927:OID21024:v1:Neurology_merged", "CXCL17:Q6UXB2:OID20622:v1:Inflammation_merged"),
  MR5 = c("CCL2:P13500:OID21004:v1:Neurology_merged", "CXCL16:Q9H2A7:OID20282:v1:Cardiometabolic_merged"),
  MR4 = c("CXCL10:P02778:OID20697:v1:Inflammation_merged", "CXCL14:O95715:OID20458:v1:Inflammation_merged", 
          "CXCL9:Q07325:OID20675:v1:Inflammation_merged"),
  MR6 = c("CXCL8:P10145:OID20153:v1:Cardiometabolic_merged", "CXCL8:P10145:OID20631:v1:Inflammation_merged", 
          "CXCL8:P10145:OID20997:v1:Neurology_merged", "CXCL8:P10145:OID21430:v1:Oncology_merged")
)

# Function to replace ':' with '_' in names
replace_colon_with_underscore <- function(chemokine_names) {
  gsub(":", "_", chemokine_names)
}

# Apply the function to all chemokines in each factor
chemokine_groupings_new <- lapply(chemokine_groupings, replace_colon_with_underscore)

# Print the new groupings
print(chemokine_groupings_new)

## $MR1
##  [1] "CCL11_P51671_OID20668_v1_Inflammation_merged"   
##  [2] "CCL13_Q99616_OID20655_v1_Inflammation_merged"   
##  [3] "CCL14_Q16627_OID20401_v1_Cardiometabolic_merged"
##  [4] "CCL17_Q92583_OID20745_v1_Inflammation_merged"   
##  [5] "CCL26_Q9Y258_OID20546_v1_Inflammation_merged"   
##  [6] "CCL28_Q9NRJ3_OID20569_v1_Inflammation_merged"   
##  [7] "CCL5_P13501_OID20412_v1_Cardiometabolic_merged" 
##  [8] "CCL7_P80098_OID20523_v1_Inflammation_merged"    
##  [9] "CCL8_P80075_OID21466_v1_Oncology_merged"        
## [10] "CXCL1_P09341_OID20762_v1_Inflammation_merged"   
## [11] "CXCL11_O14625_OID21042_v1_Neurology_merged"     
## [12] "CXCL12_P48061_OID20464_v1_Inflammation_merged"  
## [13] "CXCL3_P19876_OID20788_v1_Inflammation_merged"   
## [14] "CXCL5_P42830_OID20207_v1_Cardiometabolic_merged"
## [15] "CXCL6_P80162_OID20613_v1_Inflammation_merged"   
## 
## $MR3
## [1] "CCL15_Q16663_OID20328_v1_Cardiometabolic_merged"
## [2] "CCL16_O15467_OID20334_v1_Cardiometabolic_merged"
## [3] "CCL23_P55773_OID20693_v1_Inflammation_merged"   
## [4] "CCL25_O15444_OID20674_v1_Inflammation_merged"   
## [5] "CCL27_Q9Y4X3_OID20121_v1_Cardiometabolic_merged"
## [6] "TAFA5_Q7Z5A7_OID21321_v1_Oncology_merged"       
## 
## $MR2
##  [1] "CCL18_P55774_OID20395_v1_Cardiometabolic_merged"
##  [2] "CCL19_Q99731_OID21030_v1_Neurology_merged"      
##  [3] "CCL20_P78556_OID20671_v1_Inflammation_merged"   
##  [4] "CCL21_O00585_OID20686_v1_Inflammation_merged"   
##  [5] "CCL22_O00626_OID20765_v1_Inflammation_merged"   
##  [6] "CCL24_O00175_OID20772_v1_Inflammation_merged"   
##  [7] "CCL3_P10147_OID20610_v1_Inflammation_merged"    
##  [8] "CCL4_P13236_OID20695_v1_Inflammation_merged"    
##  [9] "CXCL13_O43927_OID21024_v1_Neurology_merged"     
## [10] "CXCL17_Q6UXB2_OID20622_v1_Inflammation_merged"  
## 
## $MR5
## [1] "CCL2_P13500_OID21004_v1_Neurology_merged"        
## [2] "CXCL16_Q9H2A7_OID20282_v1_Cardiometabolic_merged"
## 
## $MR4
## [1] "CXCL10_P02778_OID20697_v1_Inflammation_merged"
## [2] "CXCL14_O95715_OID20458_v1_Inflammation_merged"
## [3] "CXCL9_Q07325_OID20675_v1_Inflammation_merged" 
## 
## $MR6
## [1] "CXCL8_P10145_OID20153_v1_Cardiometabolic_merged"
## [2] "CXCL8_P10145_OID20631_v1_Inflammation_merged"   
## [3] "CXCL8_P10145_OID20997_v1_Neurology_merged"      
## [4] "CXCL8_P10145_OID21430_v1_Oncology_merged"

# Optionally, save the results
saveRDS(chemokine_groupings_new, file = "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/ldsc_results_rgs/chemokine_factor_groupings.rds")

6 Six-factor multivariate GWAS

# Script at:
# cd /Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/
# caffeinate -dims nohup Rscript mvldsc_40_chemokines_6_factors_6_cores_correlated_factors.R > mvldsc_40_chemokines_6_factors_6_cores_correlated_factors.log 2>&1 &
# ps -p 86402 -o stat,etime

# Load necessary libraries
library(data.table)
library(GenomicSEM)
library(TwoSampleMR)
library(dplyr)

# Set OpenBLAS, MKL, and OpenMP to 1 thread per process to prevent CPU overload
Sys.setenv(OPENBLAS_NUM_THREADS = "1")
Sys.setenv(MKL_NUM_THREADS = "1")
Sys.setenv(OMP_NUM_THREADS = "1")

# Set number of cores for parallel computing (don't use all available cores)
# Not sure this actually worked though; more than six subprocesses were spawned
options(mc.cores = 6)  # Adjust based on system capacity

# Start timer
start_time <- Sys.time()

# Define paths and files
summary_stats_path <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/summary_stats_all_40_chemokines.txt"
ldsc_results_path <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/ldsc_results_rgs/LDSCoutput40.rds"

# Load summary statistics
summary_stats <- fread(summary_stats_path)

# Garbage collect to free up memory
gc()

# Fix summary stats column names
colnames(summary_stats) <- gsub(":", "_", colnames(summary_stats))

# Load LDSC results
ldsc_results <- readRDS(ldsc_results_path)

# Fix LDSC names (Ensure consistency with summary stats)
rename_with_beta <- function(names_vector) {
  paste0("beta.", gsub(":", "_", names_vector))
}

if (!is.null(attr(ldsc_results$S, "dimnames"))) {
  attr(ldsc_results$S, "dimnames")[[2]] <- rename_with_beta(attr(ldsc_results$S, "dimnames")[[2]])
}
if (!is.null(attr(ldsc_results$S_Stand, "dimnames"))) {
  attr(ldsc_results$S_Stand, "dimnames")[[2]] <- rename_with_beta(attr(ldsc_results$S_Stand, "dimnames")[[2]])
}

# ✅ Check if names match NOW
print("LDSC dimnames after fixing:")
print(attr(ldsc_results$S, "dimnames")[[2]])
print(attr(ldsc_results$S_Stand, "dimnames")[[2]])

print("Summary stats column names:")
print(colnames(summary_stats))

factors <- readRDS("/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/ldsc_results_rgs/chemokine_factor_groupings.rds")

# Correct six_factors list based on chemokine_groupings_new
six_factors <- list(
  MR1 = c("CCL11_P51671_OID20668_v1_Inflammation_merged", "CCL13_Q99616_OID20655_v1_Inflammation_merged", 
          "CCL14_Q16627_OID20401_v1_Cardiometabolic_merged", "CCL17_Q92583_OID20745_v1_Inflammation_merged", 
          "CCL26_Q9Y258_OID20546_v1_Inflammation_merged", "CCL28_Q9NRJ3_OID20569_v1_Inflammation_merged", 
          "CCL5_P13501_OID20412_v1_Cardiometabolic_merged", "CCL7_P80098_OID20523_v1_Inflammation_merged", 
          "CCL8_P80075_OID21466_v1_Oncology_merged", "CXCL1_P09341_OID20762_v1_Inflammation_merged", 
          "CXCL11_O14625_OID21042_v1_Neurology_merged", "CXCL12_P48061_OID20464_v1_Inflammation_merged", 
          "CXCL3_P19876_OID20788_v1_Inflammation_merged", "CXCL5_P42830_OID20207_v1_Cardiometabolic_merged", 
          "CXCL6_P80162_OID20613_v1_Inflammation_merged"),
  
  MR3 = c("CCL15_Q16663_OID20328_v1_Cardiometabolic_merged", "CCL16_O15467_OID20334_v1_Cardiometabolic_merged", 
          "CCL23_P55773_OID20693_v1_Inflammation_merged", "CCL25_O15444_OID20674_v1_Inflammation_merged", 
          "CCL27_Q9Y4X3_OID20121_v1_Cardiometabolic_merged", "TAFA5_Q7Z5A7_OID21321_v1_Oncology_merged"),
  
  MR2 = c("CCL18_P55774_OID20395_v1_Cardiometabolic_merged", "CCL19_Q99731_OID21030_v1_Neurology_merged", 
          "CCL20_P78556_OID20671_v1_Inflammation_merged", "CCL21_O00585_OID20686_v1_Inflammation_merged", 
          "CCL22_O00626_OID20765_v1_Inflammation_merged", "CCL24_O00175_OID20772_v1_Inflammation_merged", 
          "CCL3_P10147_OID20610_v1_Inflammation_merged", "CCL4_P13236_OID20695_v1_Inflammation_merged", 
          "CXCL13_O43927_OID21024_v1_Neurology_merged", "CXCL17_Q6UXB2_OID20622_v1_Inflammation_merged"),
  
  MR5 = c("CCL2_P13500_OID21004_v1_Neurology_merged", "CXCL16_Q9H2A7_OID20282_v1_Cardiometabolic_merged"),
  
  MR4 = c("CXCL10_P02778_OID20697_v1_Inflammation_merged", "CXCL14_O95715_OID20458_v1_Inflammation_merged", 
          "CXCL9_Q07325_OID20675_v1_Inflammation_merged"),
  
  MR6 = c("CXCL8_P10145_OID20153_v1_Cardiometabolic_merged", "CXCL8_P10145_OID20631_v1_Inflammation_merged", 
          "CXCL8_P10145_OID20997_v1_Neurology_merged", "CXCL8_P10145_OID21430_v1_Oncology_merged")
)

# Convert factor definitions into beta-prefixed names
six_factors <- lapply(six_factors, function(x) paste0("beta.", x))

# Generate model string (Six-Factor Structure) with correlations
model_string <- paste0(
  paste0(names(six_factors), " =~ ", sapply(six_factors, paste, collapse = " + "), collapse = "\n"),
  "\n",
  paste0(combn(names(six_factors), 2, FUN = function(x) paste(x, collapse = " ~~ ")), collapse = "\n"),  # Add factor correlations
  "\n",
  paste0(names(six_factors), " ~ SNP", collapse = "\n")
)

# Generate column selection (ALL BETAS + SEs + SNP info)
all_beta_cols <- unlist(six_factors)
all_se_cols <- gsub("^beta\\.", "se.", all_beta_cols)
all_columns <- c("SNP", "CHR", "BP", "MAF", "A1", "A2", all_beta_cols, all_se_cols)

# Ensure only valid columns are used
all_columns <- all_columns[all_columns %in% colnames(summary_stats)]

# Extract data for SIX FACTORS
SNPs2 <- summary_stats[, ..all_columns]

# Run userGWAS with SIX FACTORS
CorrelatedFactors <- userGWAS(
  covstruc = ldsc_results,
  SNPs = SNPs2,
  model = model_string,
  estimation = "DWLS",
  printwarn = TRUE,
  sub = paste0(names(six_factors), "~SNP"),
  toler = FALSE,
  SNPSE = FALSE,
  parallel = TRUE,
  GC = "standard",
  MPI = FALSE,
  smooth_check = TRUE,
  fix_measurement = TRUE,
  Q_SNP = TRUE
)

# Garbage collect again before saving results
gc()

# Save results
result_path <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/userGWAS_res/userGWAS_results_6_factors_6_cores.rds"
saveRDS(CorrelatedFactors, file = result_path)
cat("GWAS completed. Results saved to:", result_path, "\n")

# Measure runtime
end_time <- Sys.time()
cat("Total runtime:", round(difftime(end_time, start_time, units = "mins"), 2), "minutes.\n")

CorrelatedFactors <- readRDS("/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/userGWAS_res/userGWAS_results_6_factors_6_cores.rds")

# Optional: Filter low-quality SNPs
CF1_bad <- CorrelatedFactors[[1]][which(CorrelatedFactors[[1]]$Q_SNP_pval < 5e-8),]
write.table(CF1_bad$SNP, "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/userGWAS_res/poor_quality_snps_40.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

# Define low-quality SNPs and the full dataset
low_quality_snps <- CF1_bad  # Subset of SNPs with Q_SNP_pval < 5e-8
all_snps <- CorrelatedFactors[[1]]  # The full SNP dataset

# Parameters
ld_window_size <- 500000  # Proximity threshold in base pairs

# Filter low-quality SNPs for required columns
low_quality_snps <- low_quality_snps %>% select(SNP, CHR, BP)

# Exclude SNPs in LD with low-quality SNPs based on proximity
filtered_snps <- all_snps %>%
  filter(!sapply(1:n(), function(i) {
    any(abs(all_snps$BP[i] - low_quality_snps$BP[low_quality_snps$CHR == all_snps$CHR[i]]) <= ld_window_size)
  }))

# Save the filtered SNPs
write.table(filtered_snps, "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/userGWAS_res/F1_filtered_snps_40.tsv", row.names = FALSE, quote = FALSE, sep = "\t")

filtered_snps <- fread("/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/userGWAS_res/F1_filtered_snps_40.tsv")
filtered_snps <- filtered_snps[order(filtered_snps$CHR),]

# Output results before filtering
significant_snps1 <- filtered_snps[filtered_snps$Pval_Estimate < 5e-08, ]
low_heterogeneity1 <- significant_snps1[significant_snps1$Q_SNP_pval > 0.05, ]
snps_set <- significant_snps1$SNP
low_heterogeneity1_snps_set <- low_heterogeneity1$SNP

# Save the filtered SNPs
write.table(low_heterogeneity1, "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/userGWAS_res/low_heterogeneity1_40.tsv", row.names = FALSE, quote = FALSE, sep = "\t")