The analysis uses Genomic Structural Equation Modeling (GenomicSEM) to run a multivariate GWAS of 40 chemokines. The value of GenomicSEM lies in its ability to enhance statistical power and uncover pleiotropic effects that may remain undetected in pairwise or univariate analyses.
40 chemokine summary statistics; Europeans; both GRCh19/38
I had previously obtained these (see https://rpubs.com/YodaMendel/1243451 for an example of how to programmatically get files from UKB-PPP.
script: /Users/charleenadams/ukbppp/process_40_chemokines.sh
#!/bin/bash
# Directories and variables
TAR_DIR="/Users/charleenadams/ukbppp/chemokine_list"
MAP_DIR="/Users/charleenadams/temp_BI/chemokine_rgs_olink/maps"
OUTPUT_BASE_DIR="/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list"
MERGED_DIR="$OUTPUT_BASE_DIR/merged_files"
LOG_FILE="$OUTPUT_BASE_DIR/processing_log.txt"
# Create required directories
mkdir -p "$OUTPUT_BASE_DIR"
mkdir -p "$MERGED_DIR"
echo "Processing started on $(date)" > "$LOG_FILE"
# Function to process each tar file
process_tar_file() {
local tar_file="$1"
local base_name=$(basename "$tar_file" .tar)
local untarred_dir="$OUTPUT_BASE_DIR/$base_name"
local output_dir="$OUTPUT_BASE_DIR/$base_name"
echo "Processing tar file: $tar_file" | tee -a "$LOG_FILE"
# Untar the file into the correct directory
mkdir -p "$untarred_dir"
tar -xf "$tar_file" -C "$untarred_dir" || {
echo "Failed to untar: $tar_file" | tee -a "$LOG_FILE"
return
}
# Process chromosome files within the untarred directory
mkdir -p "$output_dir"
local processed_files=()
find "$untarred_dir" -type f -name "*.gz" | while read -r chr_file; do
# Skip files that are already processed
if [[ "$chr_file" == *"_rsid_added.gz" || "$chr_file" == *"_with_pval.gz" ]]; then
echo "Skipping already processed file: $chr_file" | tee -a "$LOG_FILE"
continue
fi
local chrom=$(basename "$chr_file" | sed -E 's/.*chr([0-9XY]+).*/\1/')
local map_file="$MAP_DIR/olink_rsid_map_mac5_info03_b0_7_chr${chrom}_patched_v2.tsv.gz"
local output_file="${chr_file%.gz}_rsid_added.gz"
if [[ -n "$chrom" && -f "$map_file" ]]; then
echo "Adding RSID to $chr_file (Chromosome $chrom)" | tee -a "$LOG_FILE"
gunzip -c "$chr_file" | \
awk 'BEGIN {OFS="\t"}
NR==FNR {map[$1]=$4; genpos37[$1]=$5; genpos38[$1]=$6; next}
($3 in map) {print $0, map[$3], genpos37[$3], genpos38[$3]}
!($3 in map) {print $0, ".", ".", "."}' <(gunzip -c "$map_file") - | \
gzip > "$output_file"
if [[ -s "$output_file" ]]; then
echo "RSID added to $output_file" | tee -a "$LOG_FILE"
processed_files+=("$output_file")
# Add P-values if not already added
local pval_file="${output_file%.gz}_with_pval.gz"
if [[ ! -f "$pval_file" ]]; then
gunzip -c "$output_file" | \
awk 'BEGIN {OFS="\t"} NR==1 {print $0, "P"} NR>1 {pval = (10 ^ -$13); print $0, pval}' | \
gzip > "$pval_file"
mv "$pval_file" "$output_dir/$(basename "$pval_file")"
else
echo "P-values already added for $output_file" | tee -a "$LOG_FILE"
fi
else
echo "Warning: RSID addition failed for $chr_file" | tee -a "$LOG_FILE"
fi
else
echo "RSID map file for chromosome $chrom not found." | tee -a "$LOG_FILE"
fi
done
# Merge chromosome files in the output directory
if [[ ${#processed_files[@]} -gt 0 ]]; then
echo "Merging chromosome files in $output_dir" | tee -a "$LOG_FILE"
local merged_file="$MERGED_DIR/${base_name}_merged_rsid_added.gz"
# Extract header from the first file
gunzip -c "${processed_files[0]}" | head -n 1 > temp_header.txt
# Process each file, excluding the header, and append to a temporary content file
for file in "${processed_files[@]}"; do
gunzip -c "$file" | tail -n +2 >> temp_content.txt
done
# Concatenate header and content into the final merged file
cat temp_header.txt temp_content.txt | gzip -c > "$merged_file"
# Clean up temporary files
rm -f temp_header.txt temp_content.txt
if [[ -s "$merged_file" ]]; then
echo "Merged file created: $merged_file" | tee -a "$LOG_FILE"
else
echo "Warning: Merged file is empty for $output_dir" | tee -a "$LOG_FILE"
fi
else
echo "No files to merge in $output_dir" | tee -a "$LOG_FILE"
fi
}
# Export the function and variables for parallel processing
export -f process_tar_file
export MAP_DIR OUTPUT_BASE_DIR LOG_FILE MERGED_DIR
# Find all tar files and process them in parallel
find "$TAR_DIR" -type f -name "*.tar" | parallel -j15 process_tar_file
echo "Processing completed on $(date)" >> "$LOG_FILE"
echo "All results saved in $OUTPUT_BASE_DIR and merged files in $MERGED_DIR"
# tmux new -s my_session # didn't use tmux; keeping command to trigger memory
# nohup ./process_40_chemokines.sh > process_40_chemokines_log 2>&1 &
# pstree 91532
# ps -p 91532 -o stat,etime
# watch -n 1 ls -lt /Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list
# find /Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list -type f -name "*with_pval.gz" | wc -l
# remove the extra files
main_dir="/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list"
clean_directory() {
local dir="$1"
find "$dir" -type f -not -name "*_rsid_added_with_pval.gz" -not -name "*.txt" -delete
echo "Processed: $dir"
}
find "$main_dir" -type d | while read -r dir; do
clean_directory "$dir"
done
library(GenomicSEM)
library(parallel)
# Set directories
base_dir <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list"
hm3_snplist <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/w_hm3.snplist"
output_directory <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40"
# Ensure the output directory exists
if (!dir.exists(output_directory)) {
dir.create(output_directory, recursive = TRUE)
}
# Define column names as a named list
column_names <- list(
SNP = "rsid", # The column for SNP identifiers
A1 = "ALLELE1", # The column for the first allele
A2 = "ALLELE0", # The column for the second allele
effect = "BETA", # The column for the effect size
INFO = "INFO", # The column for the INFO score
P = "P", # The column for the P-value
N = "N", # The column for the sample size
MAF = "A1FREQ", # The column for the minor allele frequency
Z = "CHISQ" # The column for the Z-score or chi-squared value
)
# Function to extract `N` from the first row of a file
extract_N <- function(file, column) {
header <- read.table(gzfile(file), nrows = 1, header = TRUE)
if (column %in% colnames(header)) {
N <- unique(header[[column]])
if (length(N) == 1) {
return(as.numeric(N))
} else {
stop(paste("N column has multiple values in file:", file))
}
} else {
stop(paste("Column", column, "not found in file:", file))
}
}
# Function to process a single subdirectory
process_subdirectory <- function(subdir) {
tryCatch({
# Get subdirectory name
subdir_name <- basename(subdir)
# Create output subdirectory
subdir_output <- file.path(output_directory, subdir_name)
if (!dir.exists(subdir_output)) {
dir.create(subdir_output, recursive = TRUE)
}
# Get the list of chromosome-specific files for chromosomes 1–22
chr_files <- list.files(subdir, full.names = TRUE, pattern = "chr[1-9]|chr1[0-9]|chr2[0-2]_.*rsid_added_with_pval\\.gz$")
if (length(chr_files) == 0) {
cat(paste("No files found in subdirectory:", subdir, "\n"))
return(NULL)
}
# Extract `N` for each file
N_values <- sapply(chr_files, extract_N, column = column_names$N)
# Define trait names based on file names
trait_names <- gsub("_rsid_added_with_pval\\.gz$", "", basename(chr_files))
# Run the munge function with internal parallelization
munge(
files = chr_files,
hm3 = hm3_snplist,
trait.names = trait_names,
N = N_values,
info.filter = 0.9, # Filter by INFO score
maf.filter = 0.01, # Filter by MAF
column.names = column_names,
parallel = TRUE, # Enable internal parallelization
cores = detectCores() - 1, # Use all available cores minus one
overwrite = TRUE # Allow overwriting of existing files
)
cat(paste("Processing completed for subdirectory:", subdir, "\n"))
}, error = function(e) {
cat(paste("Error processing subdirectory:", subdir, "\nError message:", e$message, "\n"))
})
}
# Get all subdirectories
subdirs <- list.dirs(base_dir, full.names = TRUE, recursive = FALSE)
# Sequentially process subdirectories
for (subdir in subdirs) {
process_subdirectory(subdir)
}
ls -lt /Users/charleenadams/ukbppp/*.sumstats.gz | grep "Dec 8" | wc -l
# Without notes:
directory="/Users/charleenadams/ukbppp"
target_date="Dec 8"
output_dir="/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40"
mkdir -p "$output_dir"
while IFS= read -r file; do
cp "$file" "$output_dir"
done < <(ls -lt "$directory"/*.sumstats.gz | grep "$target_date" | awk '{print $NF}')
ls -lt /Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/*.sumstats.gz | grep "Dec 8" | wc -l
# Run with:
cd /Users/charleenadams/ukbppp
nohup Rscript merged_munged_40.R > merged_munged_40.log 2>&1 &
library(parallel)
# Define directories
base_dir <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40"
results_dir <- file.path(base_dir, "merged_munged_protein_results")
# Ensure the results directory exists
if (!dir.exists(results_dir)) {
dir.create(results_dir, recursive = TRUE)
}
# Function to merge chromosome files for a protein
merge_chromosome_files <- function(protein_name, chr_files, output_dir) {
tryCatch({
# Define output file path
merged_file <- file.path(output_dir, paste0(protein_name, "_merged.sumstats.gz"))
# Merge chromosome files
all_data <- do.call(rbind, lapply(chr_files, function(f) {
read.table(f, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
}))
# Save the merged file
write.table(
all_data,
file = gzfile(merged_file),
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
cat(paste("Merged protein:", protein_name, "to", merged_file, "\n"))
return(merged_file)
}, error = function(e) {
cat(paste("Error merging protein:", protein_name, "\nError message:", e$message, "\n"))
return(NULL)
})
}
# Get list of all chromosome files
chr_files <- list.files(base_dir, pattern = "discovery_chr.*\\.sumstats\\.gz$", full.names = TRUE)
# Extract unique protein identifiers
proteins <- unique(gsub("discovery_chr[0-9]+_", "", gsub("\\.sumstats\\.gz$", "", basename(chr_files))))
# Function to process a single protein
process_protein <- function(protein) {
protein_files <- grep(protein, chr_files, value = TRUE)
merge_chromosome_files(protein, protein_files, output_dir = results_dir)
}
# Run merging in parallel for 12 proteins
num_cores <- detectCores() - 1 # Use all but one core
cl <- makeCluster(num_cores)
clusterExport(cl, c("chr_files", "proteins", "merge_chromosome_files", "results_dir"))
clusterEvalQ(cl, library(parallel))
merged_files <- parLapply(cl, proteins, process_protein)
stopCluster(cl)
# Print summary
cat("Merging completed for all proteins.\n")
# nohup Rscript merged_munged_40.R > merged_munged_40.log 2>&1 &
Run with:
nohup Rscript add_p_sumstats.R > add_p_sumstats.log 2>&1 &
library(data.table)
# Define the function to add a P column
add_p_column <- function(file, output_dir) {
tryCatch({
data <- fread(file)
if (!"P" %in% colnames(data)) {
if ("Z" %in% colnames(data)) {
# Calculate P-values from Z-scores
data[, P := 2 * pnorm(-abs(Z))]
# Create output file path
output_file <- file.path(output_dir, basename(file))
# Save the updated data to the new file
fwrite(data, output_file, sep = "\t")
cat("Added P column to:", output_file, "\n")
} else {
stop("Z column missing. Cannot calculate P-values in file:", file)
}
} else {
cat("P column already exists in:", file, "\n")
}
}, error = function(e) {
cat("Error processing file:", file, "\nError message:", e$message, "\n")
})
}
# Set input and output directories
input_dir <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results"
output_dir <- file.path(input_dir, "pvals_fix")
# Create the output directory if it doesn't exist
if (!dir.exists(output_dir)) {
dir.create(output_dir)
}
# List all files matching the pattern in the input directory
files <- list.files(input_dir, pattern = "\\.sumstats\\.gz$", full.names = TRUE)
# Apply the function to all files
lapply(files, add_p_column, output_dir = output_dir)
# Run interactively in Rstudio
library(data.table)
library(GenomicSEM)
# Step 1: Define paths and variables
input_dir <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix" # Input directory containing sumstats files
output_path <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/summary_stats_all_40_chemokines.txt" # Output file for all processed summary stats
ref <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/reference.1000G.maf.0.005.txt" # Reference file for SNP variance
# Step 2: Get a list of all .sumstats.gz files in the input directory
trait_files <- list.files(input_dir, pattern = "\\.sumstats\\.gz$", full.names = TRUE)
# Check if any files were found
if (length(trait_files) == 0) {
stop("No .sumstats.gz files found in the input directory.")
}
# Extract trait names from filenames (without .sumstats.gz extension)
trait.names <- gsub("\\.sumstats\\.gz$", "", basename(trait_files))
cat("Processing traits:\n", paste(trait.names, collapse = "\n"), "\n")
# Step 3: Extract sample sizes (N) from each trait file
get_sample_size <- function(file) {
data <- fread(file, select = "N", nrows = 1) # Only read the first row to extract N
return(data$N[1])
}
N <- sapply(trait_files, get_sample_size)
cat("Extracted sample sizes:\n", paste(N, collapse = "\n"), "\n")
# Step 4: Prepare the summary statistics with sumstats()
# Define arguments for sumstats()
se.logit <- rep(FALSE, length(trait_files)) # Continuous traits; SEs are not logistic
OLS <- rep(TRUE, length(trait_files)) # Ordinary least squares for continuous traits
linprob <- rep(FALSE, length(trait_files)) # Traits are not dichotomous; linear probability does not apply
# Run sumstats() to align and scale summary statistics
aligned_sumstats <- sumstats(
files = trait_files,
ref = ref,
trait.names = trait.names,
se.logit = se.logit,
OLS = OLS,
linprob = linprob,
N = N,
betas = NULL,
#maf.filter = 0.01,
keep.indel = FALSE,
parallel = TRUE,
cores = 16 # Adjust for available cores
)
# Save the aligned summary statistics
fwrite(aligned_sumstats, file = output_path, sep = "\t")
cat("Aligned summary statistics saved to:", output_path, "\n")
# Optional: Verify the saved file by re-loading it
summary_stats_read <- fread(output_path)
print(head(summary_stats_read))
# To run:
cd /Users/charleenadams/ukbppp
nohup Rscript ldsc40.R > ldsc40.log 2>&1 &
# Load required library
library(GenomicSEM)
library(data.table)
library(parallel)
library(qqman)
# Define paths and variables
traits <- list.files("/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix",
pattern = "\\.sumstats\\.gz$", full.names = TRUE)
# Set sample and population prevalences to NA to bypass liability scaling
sample.prev <- rep(NA, length(traits)) # Treat traits as continuous
population.prev <- rep(NA, length(traits))
ld <- "/Users/charleenadams/temp_BI/mvGWAS_chemokines/eur_w_ld_chr/"
wld <- "/Users/charleenadams/temp_BI/mvGWAS_chemokines/eur_w_ld_chr/"
# Optional: Name your traits based on filenames without extensions
trait.names <- gsub("\\.sumstats\\.gz$", "", basename(traits))
head(trait.names)
# Redirect console output to a log file
sink("/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/ldsc_40_rgs.log")
# Run multivariable LDSC
LDSCoutput <- ldsc(traits = traits,
sample.prev = sample.prev,
population.prev = population.prev,
ld = ld,
wld = wld,
trait.names = trait.names,
stand = TRUE)
# Stop capturing the console output
sink()
# Save the results
output_path <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/ldsc_results_rgs"
saveRDS(LDSCoutput, file = file.path(output_path, "LDSCoutput40.rds"))
# Print completion message
cat("Multivariable LDSC analysis complete and save in:\n", output_path, "\n")
Uses parallel analysis (paLDSC) to determine number of factors to extract from a genetic correlation matrix: https://rpubs.com/JaFuente/paLDSC
library(GenomicSEM)
library(knitr)
library(dplyr)
library(kableExtra)
LDSCoutputfull <- readRDS("/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/ldsc_results_rgs/LDSCoutput40.rds")
#2. Genetic correlation matrix from LDSC output
S_Stand <- LDSCoutputfull$S_Stand
#3. Associated standardized multivariate sampling distribution matrix from LDSC output
V_Stand <- LDSCoutputfull$V_Stand
# Save the plot as a 600 DPI PNG
output_path <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/ldsc_results_rgs/paLDSC_plot_40.png"
png(filename = output_path, width = 8, height = 6, units = "in", res = 600)
# Run the `paLDSC` function
paLDSC(S = S_Stand, V = V_Stand, r = 10)
## Running parallel analysis. Replication number: 1
## Running parallel analysis. Replication number: 2
## Running parallel analysis. Replication number: 3
## Running parallel analysis. Replication number: 4
## Running parallel analysis. Replication number: 5
## Running parallel analysis. Replication number: 6
## Running parallel analysis. Replication number: 7
## Running parallel analysis. Replication number: 8
## Running parallel analysis. Replication number: 9
## Running parallel analysis. Replication number: 10
## ------------------------------------------------------------------------
## Parallel Analysis suggests extracting 6 components
## ------------------------------------------------------------------------
# Close the graphics device
dev.off()
## quartz_off_screen
## 2
# Perform factor analysis with 6 components
fa_results <- psych::fa(S_Stand, nfactors = 6, rotate = "varimax")
# Print loadings
#print(fa_results$loadings)
# Assuming fa_results$loadings is your factor loading matrix
loadings_df <- as.data.frame(unclass(fa_results$loadings))
# Process the chemokine names
loadings_df$Protein <- rownames(loadings_df)
loadings_df$Protein <- sapply(strsplit(loadings_df$Protein, ":"), function(x) paste(x[1:1], collapse = ":"))
# Order by highest loading on first factor for clarity
ordered_loadings <- loadings_df[order(-abs(loadings_df$MR1)), ]
# Select top loadings for each factor
top_loadings <- ordered_loadings %>%
mutate(across(MR1:MR5, ~ifelse(abs(.) > 0.3, ., NA))) %>%
dplyr::select(Protein, everything())
# Remove row names for the final output
rownames(top_loadings) <- NULL
# Display in a nice table (you might want to adjust the threshold or how many to show)
kable(top_loadings, "html", digits = 3,
caption = "Top Factor Loadings for Chemokines",
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE, position = "center")
| Protein | MR1 | MR2 | MR6 | MR3 | MR4 | MR5 |
|---|---|---|---|---|---|---|
| CCL5 | 0.830 | NA | NA | NA | NA | NA |
| CXCL3 | 0.779 | NA | 0.406 | NA | NA | NA |
| CCL17 | 0.768 | NA | NA | NA | NA | NA |
| CCL8 | 0.704 | NA | NA | NA | NA | NA |
| CCL26 | 0.701 | NA | NA | NA | NA | NA |
| CCL13 | 0.690 | NA | NA | NA | NA | 0.353 |
| CXCL5 | 0.688 | NA | 0.370 | NA | NA | NA |
| CXCL1 | 0.665 | NA | 0.442 | NA | NA | NA |
| CCL7 | 0.637 | 0.567 | 0.355 | NA | NA | NA |
| CXCL6 | 0.615 | NA | 0.452 | NA | NA | NA |
| CCL14 | 0.568 | NA | NA | 0.469 | NA | 0.337 |
| CXCL11 | 0.559 | 0.353 | NA | NA | 0.364 | NA |
| CXCL12 | 0.503 | NA | NA | NA | NA | NA |
| CCL28 | 0.478 | NA | NA | NA | NA | NA |
| CCL22 | 0.475 | 0.552 | NA | NA | NA | NA |
| CCL11 | 0.460 | NA | 0.329 | NA | NA | 0.342 |
| CCL27 | 0.417 | NA | NA | 0.463 | NA | NA |
| CCL4 | 0.405 | 0.468 | NA | NA | NA | NA |
| CCL2 | 0.385 | NA | 0.309 | NA | NA | 0.605 |
| CXCL8 | 0.368 | NA | 0.905 | NA | NA | NA |
| CXCL8 | 0.364 | NA | 0.906 | NA | NA | NA |
| CXCL8 | 0.358 | NA | 0.907 | NA | NA | NA |
| CXCL8 | 0.345 | NA | 0.919 | NA | NA | NA |
| CXCL13 | 0.327 | 0.637 | NA | NA | 0.335 | -0.370 |
| CCL3 | 0.310 | 0.597 | NA | 0.362 | NA | NA |
| CCL24 | NA | 0.322 | NA | NA | NA | NA |
| CCL18 | NA | 0.686 | NA | NA | NA | NA |
| CCL19 | NA | 0.816 | NA | NA | NA | NA |
| CCL23 | NA | 0.359 | NA | 0.653 | NA | NA |
| CXCL10 | NA | 0.518 | NA | NA | 0.541 | NA |
| CCL20 | NA | 0.717 | NA | NA | NA | NA |
| CXCL17 | NA | 0.399 | NA | NA | NA | NA |
| CCL21 | NA | 0.672 | NA | NA | NA | NA |
| CXCL16 | NA | NA | NA | NA | NA | 0.367 |
| CXCL9 | NA | 0.307 | NA | NA | 0.715 | NA |
| CCL16 | NA | 0.534 | NA | 0.597 | NA | NA |
| CCL25 | NA | NA | NA | 0.578 | 0.395 | NA |
| CCL15 | NA | NA | NA | 0.730 | NA | NA |
| CXCL14 | NA | 0.484 | NA | NA | -0.535 | NA |
| TAFA5 | NA | NA | NA | 0.582 | NA | 0.306 |
# Group them
# Load necessary libraries
library(psych)
# Assuming LDSCoutputfull is loaded
# If not, load it with:
# LDSCoutputfull <- readRDS("/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/ldsc_results_rgs/LDSCoutput40.rds")
# Extract standardized genetic correlation matrix
S_Stand <- LDSCoutputfull$S_Stand
# Perform factor analysis with 6 components (as suggested by parallel analysis)
fa_results <- fa(S_Stand, nfactors = 6, rotate = "varimax")
# Extract loadings
loadings <- fa_results$loadings
# Function to assign chemokines to factors based on highest loading
assign_to_factor <- function(loadings_matrix, threshold = 0.3) {
chemokines <- rownames(loadings_matrix)
factors <- colnames(loadings_matrix)
assignments <- list()
for(chemokine in chemokines) {
load <- abs(loadings_matrix[chemokine, ])
max_load <- max(load)
if(max_load >= threshold) {
max_factor <- factors[which.max(load)]
if(is.null(assignments[[max_factor]])) {
assignments[[max_factor]] <- c()
}
assignments[[max_factor]] <- c(assignments[[max_factor]], chemokine)
}
}
return(assignments)
}
# Assign chemokines to factors with a loading threshold
chemokine_groupings <- assign_to_factor(loadings, threshold = 0.3)
# Print the groupings
#print(chemokine_groupings)
# Assuming 'chemokine_groupings' is the list with chemokines grouped by factors
chemokine_groupings <- list(
MR1 = c("CCL11:P51671:OID20668:v1:Inflammation_merged", "CCL13:Q99616:OID20655:v1:Inflammation_merged",
"CCL14:Q16627:OID20401:v1:Cardiometabolic_merged", "CCL17:Q92583:OID20745:v1:Inflammation_merged",
"CCL26:Q9Y258:OID20546:v1:Inflammation_merged", "CCL28:Q9NRJ3:OID20569:v1:Inflammation_merged",
"CCL5:P13501:OID20412:v1:Cardiometabolic_merged", "CCL7:P80098:OID20523:v1:Inflammation_merged",
"CCL8:P80075:OID21466:v1:Oncology_merged", "CXCL1:P09341:OID20762:v1:Inflammation_merged",
"CXCL11:O14625:OID21042:v1:Neurology_merged", "CXCL12:P48061:OID20464:v1:Inflammation_merged",
"CXCL3:P19876:OID20788:v1:Inflammation_merged", "CXCL5:P42830:OID20207:v1:Cardiometabolic_merged",
"CXCL6:P80162:OID20613:v1:Inflammation_merged"),
MR3 = c("CCL15:Q16663:OID20328:v1:Cardiometabolic_merged", "CCL16:O15467:OID20334:v1:Cardiometabolic_merged",
"CCL23:P55773:OID20693:v1:Inflammation_merged", "CCL25:O15444:OID20674:v1:Inflammation_merged",
"CCL27:Q9Y4X3:OID20121:v1:Cardiometabolic_merged", "TAFA5:Q7Z5A7:OID21321:v1:Oncology_merged"),
MR2 = c("CCL18:P55774:OID20395:v1:Cardiometabolic_merged", "CCL19:Q99731:OID21030:v1:Neurology_merged",
"CCL20:P78556:OID20671:v1:Inflammation_merged", "CCL21:O00585:OID20686:v1:Inflammation_merged",
"CCL22:O00626:OID20765:v1:Inflammation_merged", "CCL24:O00175:OID20772:v1:Inflammation_merged",
"CCL3:P10147:OID20610:v1:Inflammation_merged", "CCL4:P13236:OID20695:v1:Inflammation_merged",
"CXCL13:O43927:OID21024:v1:Neurology_merged", "CXCL17:Q6UXB2:OID20622:v1:Inflammation_merged"),
MR5 = c("CCL2:P13500:OID21004:v1:Neurology_merged", "CXCL16:Q9H2A7:OID20282:v1:Cardiometabolic_merged"),
MR4 = c("CXCL10:P02778:OID20697:v1:Inflammation_merged", "CXCL14:O95715:OID20458:v1:Inflammation_merged",
"CXCL9:Q07325:OID20675:v1:Inflammation_merged"),
MR6 = c("CXCL8:P10145:OID20153:v1:Cardiometabolic_merged", "CXCL8:P10145:OID20631:v1:Inflammation_merged",
"CXCL8:P10145:OID20997:v1:Neurology_merged", "CXCL8:P10145:OID21430:v1:Oncology_merged")
)
# Function to replace ':' with '_' in names
replace_colon_with_underscore <- function(chemokine_names) {
gsub(":", "_", chemokine_names)
}
# Apply the function to all chemokines in each factor
chemokine_groupings_new <- lapply(chemokine_groupings, replace_colon_with_underscore)
# Print the new groupings
print(chemokine_groupings_new)
## $MR1
## [1] "CCL11_P51671_OID20668_v1_Inflammation_merged"
## [2] "CCL13_Q99616_OID20655_v1_Inflammation_merged"
## [3] "CCL14_Q16627_OID20401_v1_Cardiometabolic_merged"
## [4] "CCL17_Q92583_OID20745_v1_Inflammation_merged"
## [5] "CCL26_Q9Y258_OID20546_v1_Inflammation_merged"
## [6] "CCL28_Q9NRJ3_OID20569_v1_Inflammation_merged"
## [7] "CCL5_P13501_OID20412_v1_Cardiometabolic_merged"
## [8] "CCL7_P80098_OID20523_v1_Inflammation_merged"
## [9] "CCL8_P80075_OID21466_v1_Oncology_merged"
## [10] "CXCL1_P09341_OID20762_v1_Inflammation_merged"
## [11] "CXCL11_O14625_OID21042_v1_Neurology_merged"
## [12] "CXCL12_P48061_OID20464_v1_Inflammation_merged"
## [13] "CXCL3_P19876_OID20788_v1_Inflammation_merged"
## [14] "CXCL5_P42830_OID20207_v1_Cardiometabolic_merged"
## [15] "CXCL6_P80162_OID20613_v1_Inflammation_merged"
##
## $MR3
## [1] "CCL15_Q16663_OID20328_v1_Cardiometabolic_merged"
## [2] "CCL16_O15467_OID20334_v1_Cardiometabolic_merged"
## [3] "CCL23_P55773_OID20693_v1_Inflammation_merged"
## [4] "CCL25_O15444_OID20674_v1_Inflammation_merged"
## [5] "CCL27_Q9Y4X3_OID20121_v1_Cardiometabolic_merged"
## [6] "TAFA5_Q7Z5A7_OID21321_v1_Oncology_merged"
##
## $MR2
## [1] "CCL18_P55774_OID20395_v1_Cardiometabolic_merged"
## [2] "CCL19_Q99731_OID21030_v1_Neurology_merged"
## [3] "CCL20_P78556_OID20671_v1_Inflammation_merged"
## [4] "CCL21_O00585_OID20686_v1_Inflammation_merged"
## [5] "CCL22_O00626_OID20765_v1_Inflammation_merged"
## [6] "CCL24_O00175_OID20772_v1_Inflammation_merged"
## [7] "CCL3_P10147_OID20610_v1_Inflammation_merged"
## [8] "CCL4_P13236_OID20695_v1_Inflammation_merged"
## [9] "CXCL13_O43927_OID21024_v1_Neurology_merged"
## [10] "CXCL17_Q6UXB2_OID20622_v1_Inflammation_merged"
##
## $MR5
## [1] "CCL2_P13500_OID21004_v1_Neurology_merged"
## [2] "CXCL16_Q9H2A7_OID20282_v1_Cardiometabolic_merged"
##
## $MR4
## [1] "CXCL10_P02778_OID20697_v1_Inflammation_merged"
## [2] "CXCL14_O95715_OID20458_v1_Inflammation_merged"
## [3] "CXCL9_Q07325_OID20675_v1_Inflammation_merged"
##
## $MR6
## [1] "CXCL8_P10145_OID20153_v1_Cardiometabolic_merged"
## [2] "CXCL8_P10145_OID20631_v1_Inflammation_merged"
## [3] "CXCL8_P10145_OID20997_v1_Neurology_merged"
## [4] "CXCL8_P10145_OID21430_v1_Oncology_merged"
# Optionally, save the results
saveRDS(chemokine_groupings_new, file = "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/ldsc_results_rgs/chemokine_factor_groupings.rds")
# Script at:
# cd /Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/
# caffeinate -dims nohup Rscript mvldsc_40_chemokines_6_factors_6_cores_correlated_factors.R > mvldsc_40_chemokines_6_factors_6_cores_correlated_factors.log 2>&1 &
# ps -p 86402 -o stat,etime
# Load necessary libraries
library(data.table)
library(GenomicSEM)
library(TwoSampleMR)
library(dplyr)
# Set OpenBLAS, MKL, and OpenMP to 1 thread per process to prevent CPU overload
Sys.setenv(OPENBLAS_NUM_THREADS = "1")
Sys.setenv(MKL_NUM_THREADS = "1")
Sys.setenv(OMP_NUM_THREADS = "1")
# Set number of cores for parallel computing (don't use all available cores)
# Not sure this actually worked though; more than six subprocesses were spawned
options(mc.cores = 6) # Adjust based on system capacity
# Start timer
start_time <- Sys.time()
# Define paths and files
summary_stats_path <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/summary_stats_all_40_chemokines.txt"
ldsc_results_path <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/ldsc_results_rgs/LDSCoutput40.rds"
# Load summary statistics
summary_stats <- fread(summary_stats_path)
# Garbage collect to free up memory
gc()
# Fix summary stats column names
colnames(summary_stats) <- gsub(":", "_", colnames(summary_stats))
# Load LDSC results
ldsc_results <- readRDS(ldsc_results_path)
# Fix LDSC names (Ensure consistency with summary stats)
rename_with_beta <- function(names_vector) {
paste0("beta.", gsub(":", "_", names_vector))
}
if (!is.null(attr(ldsc_results$S, "dimnames"))) {
attr(ldsc_results$S, "dimnames")[[2]] <- rename_with_beta(attr(ldsc_results$S, "dimnames")[[2]])
}
if (!is.null(attr(ldsc_results$S_Stand, "dimnames"))) {
attr(ldsc_results$S_Stand, "dimnames")[[2]] <- rename_with_beta(attr(ldsc_results$S_Stand, "dimnames")[[2]])
}
# ✅ Check if names match NOW
print("LDSC dimnames after fixing:")
print(attr(ldsc_results$S, "dimnames")[[2]])
print(attr(ldsc_results$S_Stand, "dimnames")[[2]])
print("Summary stats column names:")
print(colnames(summary_stats))
factors <- readRDS("/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/ldsc_results_rgs/chemokine_factor_groupings.rds")
# Correct six_factors list based on chemokine_groupings_new
six_factors <- list(
MR1 = c("CCL11_P51671_OID20668_v1_Inflammation_merged", "CCL13_Q99616_OID20655_v1_Inflammation_merged",
"CCL14_Q16627_OID20401_v1_Cardiometabolic_merged", "CCL17_Q92583_OID20745_v1_Inflammation_merged",
"CCL26_Q9Y258_OID20546_v1_Inflammation_merged", "CCL28_Q9NRJ3_OID20569_v1_Inflammation_merged",
"CCL5_P13501_OID20412_v1_Cardiometabolic_merged", "CCL7_P80098_OID20523_v1_Inflammation_merged",
"CCL8_P80075_OID21466_v1_Oncology_merged", "CXCL1_P09341_OID20762_v1_Inflammation_merged",
"CXCL11_O14625_OID21042_v1_Neurology_merged", "CXCL12_P48061_OID20464_v1_Inflammation_merged",
"CXCL3_P19876_OID20788_v1_Inflammation_merged", "CXCL5_P42830_OID20207_v1_Cardiometabolic_merged",
"CXCL6_P80162_OID20613_v1_Inflammation_merged"),
MR3 = c("CCL15_Q16663_OID20328_v1_Cardiometabolic_merged", "CCL16_O15467_OID20334_v1_Cardiometabolic_merged",
"CCL23_P55773_OID20693_v1_Inflammation_merged", "CCL25_O15444_OID20674_v1_Inflammation_merged",
"CCL27_Q9Y4X3_OID20121_v1_Cardiometabolic_merged", "TAFA5_Q7Z5A7_OID21321_v1_Oncology_merged"),
MR2 = c("CCL18_P55774_OID20395_v1_Cardiometabolic_merged", "CCL19_Q99731_OID21030_v1_Neurology_merged",
"CCL20_P78556_OID20671_v1_Inflammation_merged", "CCL21_O00585_OID20686_v1_Inflammation_merged",
"CCL22_O00626_OID20765_v1_Inflammation_merged", "CCL24_O00175_OID20772_v1_Inflammation_merged",
"CCL3_P10147_OID20610_v1_Inflammation_merged", "CCL4_P13236_OID20695_v1_Inflammation_merged",
"CXCL13_O43927_OID21024_v1_Neurology_merged", "CXCL17_Q6UXB2_OID20622_v1_Inflammation_merged"),
MR5 = c("CCL2_P13500_OID21004_v1_Neurology_merged", "CXCL16_Q9H2A7_OID20282_v1_Cardiometabolic_merged"),
MR4 = c("CXCL10_P02778_OID20697_v1_Inflammation_merged", "CXCL14_O95715_OID20458_v1_Inflammation_merged",
"CXCL9_Q07325_OID20675_v1_Inflammation_merged"),
MR6 = c("CXCL8_P10145_OID20153_v1_Cardiometabolic_merged", "CXCL8_P10145_OID20631_v1_Inflammation_merged",
"CXCL8_P10145_OID20997_v1_Neurology_merged", "CXCL8_P10145_OID21430_v1_Oncology_merged")
)
# Convert factor definitions into beta-prefixed names
six_factors <- lapply(six_factors, function(x) paste0("beta.", x))
# Generate model string (Six-Factor Structure) with correlations
model_string <- paste0(
paste0(names(six_factors), " =~ ", sapply(six_factors, paste, collapse = " + "), collapse = "\n"),
"\n",
paste0(combn(names(six_factors), 2, FUN = function(x) paste(x, collapse = " ~~ ")), collapse = "\n"), # Add factor correlations
"\n",
paste0(names(six_factors), " ~ SNP", collapse = "\n")
)
# Generate column selection (ALL BETAS + SEs + SNP info)
all_beta_cols <- unlist(six_factors)
all_se_cols <- gsub("^beta\\.", "se.", all_beta_cols)
all_columns <- c("SNP", "CHR", "BP", "MAF", "A1", "A2", all_beta_cols, all_se_cols)
# Ensure only valid columns are used
all_columns <- all_columns[all_columns %in% colnames(summary_stats)]
# Extract data for SIX FACTORS
SNPs2 <- summary_stats[, ..all_columns]
# Run userGWAS with SIX FACTORS
CorrelatedFactors <- userGWAS(
covstruc = ldsc_results,
SNPs = SNPs2,
model = model_string,
estimation = "DWLS",
printwarn = TRUE,
sub = paste0(names(six_factors), "~SNP"),
toler = FALSE,
SNPSE = FALSE,
parallel = TRUE,
GC = "standard",
MPI = FALSE,
smooth_check = TRUE,
fix_measurement = TRUE,
Q_SNP = TRUE
)
# Garbage collect again before saving results
gc()
# Save results
result_path <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/userGWAS_res/userGWAS_results_6_factors_6_cores.rds"
saveRDS(CorrelatedFactors, file = result_path)
cat("GWAS completed. Results saved to:", result_path, "\n")
# Measure runtime
end_time <- Sys.time()
cat("Total runtime:", round(difftime(end_time, start_time, units = "mins"), 2), "minutes.\n")
CorrelatedFactors <- readRDS("/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/userGWAS_res/userGWAS_results_6_factors_6_cores.rds")
# Optional: Filter low-quality SNPs
CF1_bad <- CorrelatedFactors[[1]][which(CorrelatedFactors[[1]]$Q_SNP_pval < 5e-8),]
write.table(CF1_bad$SNP, "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/userGWAS_res/poor_quality_snps_40.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
# Define low-quality SNPs and the full dataset
low_quality_snps <- CF1_bad # Subset of SNPs with Q_SNP_pval < 5e-8
all_snps <- CorrelatedFactors[[1]] # The full SNP dataset
# Parameters
ld_window_size <- 500000 # Proximity threshold in base pairs
# Filter low-quality SNPs for required columns
low_quality_snps <- low_quality_snps %>% select(SNP, CHR, BP)
# Exclude SNPs in LD with low-quality SNPs based on proximity
filtered_snps <- all_snps %>%
filter(!sapply(1:n(), function(i) {
any(abs(all_snps$BP[i] - low_quality_snps$BP[low_quality_snps$CHR == all_snps$CHR[i]]) <= ld_window_size)
}))
# Save the filtered SNPs
write.table(filtered_snps, "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/userGWAS_res/F1_filtered_snps_40.tsv", row.names = FALSE, quote = FALSE, sep = "\t")
filtered_snps <- fread("/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/userGWAS_res/F1_filtered_snps_40.tsv")
filtered_snps <- filtered_snps[order(filtered_snps$CHR),]
# Output results before filtering
significant_snps1 <- filtered_snps[filtered_snps$Pval_Estimate < 5e-08, ]
low_heterogeneity1 <- significant_snps1[significant_snps1$Q_SNP_pval > 0.05, ]
snps_set <- significant_snps1$SNP
low_heterogeneity1_snps_set <- low_heterogeneity1$SNP
# Save the filtered SNPs
write.table(low_heterogeneity1, "/Users/charleenadams/temp_BI/chemokine_rgs_olink/processed_ukbppp_chemokine_list/munged_40/merged_munged_protein_results/pvals_fix/userGWAS_res/low_heterogeneity1_40.tsv", row.names = FALSE, quote = FALSE, sep = "\t")