userGWAS of Four
Selected Chemokines: CCL5, CXCL5, CCL8, & CCL7In this analysis, we employ a genomic structural equation model (GenomicSEM) to define a new phenotype: a latent factor that influences four CCL5, CXCL5, CCL8, & CCL7.
In essence, while the chemokines themselves are directly measurable phenotypes, the latent factor we infer captures the shared genetic influence among these chemokines. The GWAS results, therefore, represent the genetic associations with this inferred, composite trait, providing insights into the genetics of an underlying immune phenomenon rather than focusing on any single chemokine alone.
F1 =~ CCL5 + CXCL5 + CCL8 + CCL7 means
that each chemokine is an indicator of this latent factor. Essentially,
\(F1\) models an underlying biological
process or trait that influences all four chemokines
simultaneously.F1 ~ SNP means the model tests how each
genetic variant (SNP) contributes to or affects the latent factor \(F1\).This setup is powerful for identifying genetic influences that operate across multiple traits, focusing on what’s shared rather than what’s unique.
NOTE: I’ve left the bioinformatics code out of this document to spare my eyes and focus in on the goal: showcasing what GenomicSEM can do for us with summary statistics. A record of the original bioinformatics steps is in the toy document I initially generated with a model that included 39 chemokines: mvGWAS_chemokines.Rmd.
# Load required library
library(GenomicSEM)
library(data.table)
library(parallel)
library(qqman)
# MVLDSC for CCL5, CXCL5, CCL8, CCL7
# Define paths and variables
traits <- list.files("/Users/charleenadams/temp_BI/chemokine_rgs_olink/munged/harmonized/",
pattern = "\\.sumstats\\.gz$", full.names = TRUE)
# Filter trait.names to include only those with CCL5, CXCL5, CCL8, CCL7
selected_traits <- traits[grepl("CCL5|CXCL5|CCL8|CCL7", traits)]
# View the selected traits
print(selected_traits)
# Set sample and population prevalences to NA to bypass liability scaling
sample.prev <- rep(NA, length(selected_traits)) # Treat traits as continuous
population.prev <- rep(NA, length(selected_traits))
ld <- "/Users/charleenadams/temp_BI/mvGWAS_chemokines/eur_w_ld_chr/"
wld <- "/Users/charleenadams/temp_BI/mvGWAS_chemokines/eur_w_ld_chr/"
# Optional: Name your traits based on filenames without extensions
trait.names <- gsub("\\.sumstats\\.gz$", "", basename(selected_traits))
# Filter trait.names to include only those with CCL5, CXCL5, CCL8, CCL7
selected.traits.names <- trait.names[grepl("CCL5|CXCL5|CCL8|CCL7", trait.names)]
# View the selected traits
print(selected.traits.names)
# Redirect console output to a log file
sink("/Users/charleenadams/temp_BI/chemokine_rgs_olink/munged/harmonized/CCL5_CXCL5_CCL8_CCL7/CCL5_CXCL5_CCL8_CCL7_ldsc_log.txt")
# Run multivariable LDSC
CCL5_CXCL5_CCL8_CCL7_LDSCoutput <- ldsc(traits = selected_traits,
sample.prev = sample.prev,
population.prev = population.prev,
ld = ld,
wld = wld,
trait.names = selected.traits.names,
stand = TRUE)
# Stop capturing the console output
sink()
# Save the results
output_path <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/munged/harmonized/CCL5_CXCL5_CCL8_CCL7"
saveRDS(CCL5_CXCL5_CCL8_CCL7_LDSCoutput, file = file.path(output_path, "CCL5_CXCL5_CCL8_CCL7_LDSCoutput.rds"))
# Print completion message
cat("Multivariable LDSC analysis complete and save in:\n", output_path, "\n")
# Sumstats
# Step 1: Define paths and variables
traits <- list.files(
"/Users/charleenadams/temp_BI/chemokine_rgs_olink/munged/harmonized/pvals.fix",
pattern = "\\.sumstats$", # Match only unzipped files
full.names = TRUE
)
# Filter trait.names to include only those with CCL5, CXCL5, CCL8, CCL7
selected_traits <- traits[grepl("CCL5|CXCL5|CCL8|CCL7", traits)]
# View the selected traits
print(selected_traits)
# Optional: Name your traits based on filenames without extensions
trait.names <- gsub("\\.sumstats\\.gz$", "", basename(traits))
# Filter trait.names to include only those with CCL5, CXCL5, CCL8, CCL7
selected.traits.names <- trait.names[grepl("CCL5|CXCL5|CCL8|CCL7", trait.names)]
# View the selected traits
print(selected.traits.names)
# Step 2: Extract sample sizes (N) from each trait file
# Assuming the column name for sample size is "N"
get_sample_size <- function(file) {
data <- fread(file, select = "N", nrows = 1) # Only read the first row to extract N
return(data$N[1])
}
N <- sapply(selected_traits, get_sample_size)
# Print extracted sample sizes for debugging
cat("Extracted sample sizes:\n")
print(N)
# Step 3: Prepare the summary statistics with sumstats()
# Define the reference file for SNP variance
ref <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink//reference.1000G.maf.0.005.txt"
# Define arguments for sumstats()
se.logit <- rep(FALSE, length(selected_traits)) # These are continuous traits; SEs are not logistic
OLS <- rep(TRUE, length(selected_traits)) # OLS is likely because these are continuous traits
linprob <- rep(FALSE, length(selected_traits)) # Traits are not dichotomous; linear probability does not apply
# Run sumstats() to align and scale summary statistics
CCL5_CXCL5_CCL8_CCL7_summary_stats <- sumstats(
files = selected_traits,
ref = ref,
trait.names = selected.traits.names,
se.logit = se.logit,
OLS = OLS,
linprob = linprob,
N = N,
betas = NULL,
# info.filter = 0.6,
maf.filter = 0.01,
keep.indel = FALSE,
parallel = TRUE,
cores = 4 # Adjust for available cores
)
output_path <- "/Users/charleenadams/temp_BI/mvGWAS_chemokines/latent_chemokine_sumstats/CCL5_CXCL5_CCL8_CCL7/CCL5_CXCL5_CCL8_CCL7_summary_stats.txt"
fwrite(CCL5_CXCL5_CCL8_CCL7_summary_stats, file = output_path, sep = "\t")
summary_stats <- fread("/Users/charleenadams/temp_BI/mvGWAS_chemokines/latent_chemokine_sumstats/CCL5_CXCL5_CCL8_CCL7/CCL5_CXCL5_CCL8_CCL7_summary_stats.txt")
# userGWAS
# Start timer to measure runtime
start_time <- Sys.time()
# Step 1: Stop and clean up any parallel clusters
if (!is.null(cl <- getOption("cl"))) {
stopCluster(cl)
options(cl = NULL)
}
# Step 1: Define paths and variables
traits <- list.files(
"/Users/charleenadams/temp_BI/chemokine_rgs_olink/munged/harmonized/pvals.fix",
pattern = "\\.sumstats$", # Match only unzipped files
full.names = TRUE
)
# Filter trait.names to include only those with
selected_traits <- traits[grepl("CCL5|CXCL5|CCL8|CCL7", traits)]
# View the selected traits
print(selected_traits)
# Optional: Name your traits based on filenames without extensions
trait.names <- gsub("\\.sumstats\\.gz$", "", basename(traits))
# Filter trait.names to include only those with CCL5, CXCL5, CCL8, CCL7
selected.traits.names <- trait.names[grepl("CCL5|CXCL5|CCL8|CCL7", trait.names)]
# View the selected traits
print(selected.traits.names)
# Extract sample sizes (N) from each trait file
get_sample_size <- function(file) {
data <- fread(file, select = "N", nrows = 1) # Only read the first row to extract N
return(data$N[1])
}
N <- sapply(selected_traits, get_sample_size)
cat("Extracted sample sizes:\n")
# Load summary statistics (use a subset of SNPs for testing)
summary_stats <- fread("/Users/charleenadams/temp_BI/mvGWAS_chemokines/latent_chemokine_sumstats/CCL5_CXCL5_CCL8_CCL7/CCL5_CXCL5_CCL8_CCL7_summary_stats.txt")
# Step 3: Load LDSC results
ldsc_results_path <- "/Users/charleenadams/temp_BI/chemokine_rgs_olink/munged/harmonized/CCL5_CXCL5_CCL8_CCL7/CCL5_CXCL5_CCL8_CCL7_LDSCoutput.rds"
ldsc_results <- readRDS(ldsc_results_path)
# Step 4: Specify the model for userGWAS
ldsc_trait_names <- attr(ldsc_results$S, "dimnames")[[2]]
model_string <- paste0("F1 =~ ", paste(ldsc_trait_names, collapse = " + "), "\nF1 ~ SNP")
# Step 5: Run the Multivariate GWAS
max_cores <- 15
cat("Using", max_cores, "cores for parallel processing.\n")
CCL5_CXCL5_CCL8_CCL7_CorrelatedFactors <- userGWAS(
covstruc = ldsc_results,
SNPs = summary_stats,
model = model_string,
estimation = "DWLS",
printwarn = TRUE,
sub = c("F1~SNP"),
cores = max_cores,
toler = FALSE,
SNPSE = FALSE,
parallel = TRUE,
GC = "standard",
MPI = FALSE,
smooth_check = TRUE,
fix_measurement = TRUE,
Q_SNP = TRUE
)
# Step 7: Output results
result_path <- "/Users/charleenadams/temp_BI/mvGWAS_chemokines/results/CCL5_CXCL5_CCL8_CCL7_userGWAS_CorrelatedFactors_results.rds"
saveRDS(CCL5_CXCL5_CCL8_CCL7_CorrelatedFactors, file = result_path)
cat("GWAS completed. Results saved to:", result_path, "\n")
# Optional: Print some results for immediate feedback
print(names(CCL5_CXCL5_CCL8_CCL7_CorrelatedFactors))
# Define the file path
result_path <- "/Users/charleenadams/temp_BI/mvGWAS_chemokines/results/CCL5_CXCL5_CCL8_CCL7_userGWAS_CorrelatedFactors_results.rds"
# Read the RDS file
CCL5_CXCL5_CCL8_CCL7_CorrelatedFactors <- readRDS(result_path)
# Inspect the contents
str(CCL5_CXCL5_CCL8_CCL7_CorrelatedFactors) # View the structure of the object
# Output results
significant_snps <- CCL5_CXCL5_CCL8_CCL7_CorrelatedFactors[[1]][CCL5_CXCL5_CCL8_CCL7_CorrelatedFactors[[1]]$Pval_Estimate < 5e-08, ]
#print(significant_snps)
dim(significant_snps)
low_heterogeneity <- CCL5_CXCL5_CCL8_CCL7_CorrelatedFactors[[1]][CCL5_CXCL5_CCL8_CCL7_CorrelatedFactors[[1]]$Q_SNP_pval > 0.05, ]
#print(low_heterogeneity)
# Prepare data for plots
manhattan_data <- data.frame(
SNP = CCL5_CXCL5_CCL8_CCL7_CorrelatedFactors[[1]]$SNP,
CHR = CCL5_CXCL5_CCL8_CCL7_CorrelatedFactors[[1]]$CHR,
BP = CCL5_CXCL5_CCL8_CCL7_CorrelatedFactors[[1]]$BP,
P = CCL5_CXCL5_CCL8_CCL7_CorrelatedFactors[[1]]$Pval_Estimate
)
# Save Manhattan plot to file
png(
filename = "/Users/charleenadams/temp_BI/mvGWAS_chemokines/results/Manhattan_CCL5_CXCL5_CCL8_CCL7_CorrelatedFactors_LatentChemokineFactor.png",
width = 8,
height = 6,
units = "in",
res = 300
)
# Step 6: Display Manhattan plot
cat("Displaying Manhattan plot.\n")
manhattan(
manhattan_data,
chr = "CHR",
bp = "BP",
p = "P",
snp = "SNP",
genomewideline = -log10(5e-8),
suggestiveline = -log10(1e-5),
col = c("blue4", "skyblue"),
main = "Manhattan Plot for Latent CCL5/CXCL5/CCL8/CCL7 Chemokine Genetic Factor",
ylim = c(0, 10)
)
# Close the graphics device
dev.off()
# Save QQ plot to file
png(
filename = "/Users/charleenadams/temp_BI/mvGWAS_chemokines/results/QQ_Latent_CCL5_CXCL5_CCL8_CCL7_CorrelatedFactors_ChemokineFactor.png",
width = 8,
height = 6,
units = "in",
res = 300
)
# Step 7: Display QQ plot
cat("Displaying QQ plot.\n")
qq(manhattan_data$P, main = "QQ Plot for Latent CCL5/CXCL5/CCL8/CCL7 Chemokine Genetic Factor")
# Close the graphics device
dev.off()
# End timer and display runtime
end_time <- Sys.time()
cat("Total runtime:", end_time - start_time, "\n")
# Run with:
# tmux new -s my_session (tmux my_session to end)
# caffeinate -i nohup Rscript CCL5_CXCL5_CCL8_CCL7_userGWAS.R > CCL5_CXCL5_CCL8_CCL7_userGWAS_log.txt 2>&1 &
# pstree -p 21557
# ps -p 21557 -o stat,etime
# Path to the saved Manhattan plot
manhattan_plot_path <- "/Users/charleenadams/temp_BI/mvGWAS_chemokines/results/Manhattan_CCL5_CXCL5_CCL8_CCL7_CorrelatedFactors_LatentChemokineFactor.png"
# Display the image
knitr::include_graphics(manhattan_plot_path)
Manhattan Plot for Latent CCL5/CXCL5/CCL8/CCL7 Chemokine Genetic Factor
# Path to the saved QQ plot
qq_plot_path <- "/Users/charleenadams/temp_BI/mvGWAS_chemokines/results/QQ_Latent_CCL5_CXCL5_CCL8_CCL7_CorrelatedFactors_ChemokineFactor.png"
# Display the image
knitr::include_graphics(qq_plot_path)
QQ Plot for Latent CCL5/CXCL5/CCL8/CCL7 Chemokine Genetic Factor
Here, we perform a factor analysis to ascertain from the data how many factors likely explain the data.
## Factor Analysis using method = ml
## Call: fa(r = cor_matrix, nfactors = 4, rotate = "oblimin", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
## ML1 ML2 ML3
## CCL5_P13501_OID20412_v1_Cardiometabolic_munged_noNA_harmonized 0.85 0.09 0
## CCL7_P80098_OID20523_v1_Inflammation_munged_noNA_harmonized 0.88 -0.06 0
## CCL8_P80075_OID21466_v1_Oncology_munged_noNA_harmonized 0.90 -0.14 0
## CXCL5_P42830_OID20207_v1_Cardiometabolic_munged_noNA_harmonized 0.72 0.25 0
## ML4 h2 u2
## CCL5_P13501_OID20412_v1_Cardiometabolic_munged_noNA_harmonized 0 0.75 0.25
## CCL7_P80098_OID20523_v1_Inflammation_munged_noNA_harmonized 0 0.77 0.23
## CCL8_P80075_OID21466_v1_Oncology_munged_noNA_harmonized 0 0.79 0.21
## CXCL5_P42830_OID20207_v1_Cardiometabolic_munged_noNA_harmonized 0 0.64 0.36
## com
## CCL5_P13501_OID20412_v1_Cardiometabolic_munged_noNA_harmonized 1.0
## CCL7_P80098_OID20523_v1_Inflammation_munged_noNA_harmonized 1.0
## CCL8_P80075_OID21466_v1_Oncology_munged_noNA_harmonized 1.0
## CXCL5_P42830_OID20207_v1_Cardiometabolic_munged_noNA_harmonized 1.2
##
## ML1 ML2 ML3 ML4
## SS loadings 2.84 0.11 0.00 0.00
## Proportion Var 0.71 0.03 0.00 0.00
## Cumulative Var 0.71 0.74 0.74 0.74
## Proportion Explained 0.96 0.04 0.00 0.00
## Cumulative Proportion 0.96 1.00 1.00 1.00
##
## With factor correlations of
## ML1 ML2 ML3 ML4
## ML1 1.00 0.16 0 0
## ML2 0.16 1.00 0 0
## ML3 0.00 0.00 1 0
## ML4 0.00 0.00 0 1
##
## Mean item complexity = 1.1
## Test of the hypothesis that 4 factors are sufficient.
##
## df null model = 6 with the objective function = 2.96
## df of the model are -4 and the objective function was 0.14
##
## The root mean square of the residuals (RMSR) is 0.04
## The df corrected root mean square of the residuals is NA
##
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## ML1 ML2 ML3 ML4
## Correlation of (regression) scores with factors 0.95 0.48 0 0
## Multiple R square of scores with factors 0.91 0.23 0 0
## Minimum correlation of possible factor scores 0.81 -0.54 -1 -1
| Chemokine | Factor Loading |
|---|---|
| CCL5_P13501_OID20412_v1_Cardiometabolic_munged_noNA_harmonized | 0.847 |
| CCL7_P80098_OID20523_v1_Inflammation_munged_noNA_harmonized | 0.884 |
| CCL8_P80075_OID21466_v1_Oncology_munged_noNA_harmonized | 0.899 |
| CXCL5_P42830_OID20207_v1_Cardiometabolic_munged_noNA_harmonized | 0.718 |