Purpose: Demonstrate the fundamental difference between DEG/limma analysis and eQTL/gene-specific genotype analysis.
Core Concept: limma uses a single design matrix for all genes. Gene-specific predictors (like local genotype) cannot be included because they vary per gene. eQTL analysis requires fitting a separate model for each gene.
| Package/Function | Model | Effect Analysis | Why |
|---|---|---|---|
| limma::lmFit | expression ~ Treatment + donor_taxa | ✅ Possible | Sample-level proxy misses recombination |
| limma::lmFit | expression ~ Treatment + geneGT | ❌ IMPOSSIBLE | Cannot vary design matrix per gene |
| MatrixEQTL | expression ~ Treatment + geneGT | ✅ Possible | Correct eQTL approach |
suppressPackageStartupMessages({
library(edgeR)
library(limma)
library(MatrixEQTL)
library(Matrix)
library(dplyr)
library(ggplot2)
library(tidyr)
library(rtracklayer)
library(pheatmap)
})
Purpose: Load expression data, sample metadata, and gene-level genotypes.
Approach: This dataset includes 150 maize samples with:
# Load expression counts
counts <- readRDS(
file.path(
here::here("output", "generate_synthetic_data"),
"synthetic_counts.rds"
)
)
# Load sample metadata
metadata <- read.csv(
file.path(
here::here("output", "generate_synthetic_data"),
"synthetic_metadata.csv"
),
row.names = 1
)
# Load gene-level genotypes
geneGT <- readRDS(
file.path(
here::here("output", "generate_synthetic_data"),
"synthetic_geneGT.rds"
)
)
# Load gene annotations for Manhattan plot
gff <- import(
file.path(
here::here("data"),
"Zea_mays.Zm-B73-REFERENCE-NAM-5.0.59.chr.gff3"
)
)
genes_annot <- subset(gff, type == "gene")
genes_annot$gene_id <- gsub("gene:", "", genes_annot$ID)
# Ensure factor levels
metadata$Treatment <- factor(metadata$Treatment, levels = c("High_P", "Low_P"))
metadata$Inversion <- factor(metadata$Inversion, levels = c("CTRL", "Inv4m"))
# Diagnostics
{
cat("=== Data Loaded ===\n\n")
cat("Expression matrix:", nrow(counts), "genes x", ncol(counts), "samples\n")
cat("Genotype matrix:", nrow(geneGT), "genes x", ncol(geneGT), "samples\n")
cat("Gene annotations:", length(genes_annot), "genes\n")
}
## === Data Loaded ===
##
## Expression matrix: 21597 genes x 150 samples
## Genotype matrix: 21597 genes x 150 samples
## Gene annotations: 43459 genes
Purpose: Verify the experimental design is balanced.
# Create distribution table
sample_table <- table(
Treatment = metadata$Treatment,
Inversion = metadata$Inversion
)
{
cat("=== Sample Distribution ===\n\n")
print(sample_table)
cat("\nTotal samples:", nrow(metadata), "\n")
cat("Treatment balance:", paste(table(metadata$Treatment), collapse = "/"), "\n")
cat("Inversion balance:", paste(table(metadata$Inversion), collapse = "/"), "\n")
print(
with(metadata,
table(donor_taxa, spp)
)
)
print(
with(metadata,
table(spp, Inversion)
)
)
}
## === Sample Distribution ===
##
## Inversion
## Treatment CTRL Inv4m
## High_P 38 37
## Low_P 37 38
##
## Total samples: 150
## Treatment balance: 75/75
## Inversion balance: 75/75
## spp
## donor_taxa mexicana parviglumis
## Bals 0 75
## Chal 40 0
## Mesa 35 0
## Inversion
## spp CTRL Inv4m
## mexicana 0 75
## parviglumis 75 0
Purpose: Visualize sample clustering by Treatment and Inversion status.
Approach: Use limma’s plotMDS to perform multidimensional scaling on the expression data.
# Create DGEList and normalize
y <- DGEList(counts = counts, samples = metadata)
y <- calcNormFactors(y)
# Calculate MDS coordinates
mds <- plotMDS(y, plot = FALSE)
mds_data <- data.frame(
x = mds$x,
y = mds$y,
Treatment = metadata$Treatment,
Inversion = metadata$Inversion,
sample = rownames(metadata)
)
# Create plots
p1 <- ggplot(mds_data, aes(x = x, y = y, color = Treatment)) +
geom_point(size = 3, alpha = 0.7) +
scale_color_manual(values = c("High_P" = "steelblue", "Low_P" = "coral")) +
labs(
title = "MDS Plot: Colored by Treatment",
x = paste0("Leading logFC dim 1 (", round(mds$var.explained[1] * 100, 1), "%)"),
y = paste0("Leading logFC dim 2 (", round(mds$var.explained[2] * 100, 1), "%)")
) +
theme_bw() +
theme(legend.position = "bottom")
p2 <- ggplot(mds_data, aes(x = x, y = y, color = Inversion)) +
geom_point(size = 3, alpha = 0.7) +
scale_color_manual(values = c("CTRL" = "forestgreen", "Inv4m" = "purple")) +
labs(
title = "MDS Plot: Colored by Inversion",
x = paste0("Leading logFC dim 1 (", round(mds$var.explained[1] * 100, 1), "%)"),
y = paste0("Leading logFC dim 2 (", round(mds$var.explained[2] * 100, 1), "%)")
) +
theme_bw() +
theme(legend.position = "bottom")
gridExtra::grid.arrange(p1, p2, ncol = 2)
Purpose: Use donor_taxa (Bals, Mesa, Chal) instead of Inversion to capture more granular population effects.
Key Point: Inversion and donor_taxa are perfectly confounded—all Bals samples are CTRL, all Mesa/Chal are Inv4m. We cannot include both in the same model. Using donor_taxa gives us separate effects for Mesa vs Bals and Chal vs Bals, which is more informative than pooling them into a single Inversion effect.
Equation: \(\log_2(\text{expression}_g) = \beta_0 + \beta_1 \cdot \text{Treatment} + \beta_2 \cdot \text{Mesa} + \beta_3 \cdot \text{Chal} + \epsilon\)
(where Bals is the reference level)
# Ensure donor_taxa has Bals as reference
metadata$donor_taxa <- factor(
metadata$donor_taxa,
levels = c("Bals", "Mesa", "Chal")
)
# Design matrix with donor_taxa
design1 <- model.matrix(~ Treatment + donor_taxa, data = metadata)
{
cat("=== Design Matrix: ~ Treatment + donor_taxa ===\n")
cat("Columns:", colnames(design1), "\n")
cat("\nNote: Bals is the reference level for donor_taxa\n")
print(head(design1, 6))
}
## === Design Matrix: ~ Treatment + donor_taxa ===
## Columns: (Intercept) TreatmentLow_P donor_taxaMesa donor_taxaChal
##
## Note: Bals is the reference level for donor_taxa
## (Intercept) TreatmentLow_P donor_taxaMesa donor_taxaChal
## S001 1 1 0 0
## S002 1 1 0 0
## S003 1 0 0 0
## S004 1 0 0 0
## S005 1 1 0 0
## S006 1 1 0 0
# Fit model
voom1 <- voom(y, design1, plot = FALSE)
fit1 <- lmFit(voom1, design1)
fit1 <- eBayes(fit1)
# Extract results for all coefficients
results1_treatment <- topTable(
fit1, coef = "TreatmentLow_P", n = Inf, sort.by = "none"
)
results1_mesa <- topTable(
fit1, coef = "donor_taxaMesa", n = Inf, sort.by = "none"
)
results1_chal <- topTable(
fit1, coef = "donor_taxaChal", n = Inf, sort.by = "none"
)
results1_treatment$gene <- rownames(results1_treatment)
results1_mesa$gene <- rownames(results1_mesa)
results1_chal$gene <- rownames(results1_chal)
# Diagnostics
{
cat("\n=== Model 3 Results ===\n")
cat("\nTreatment effect (Low_P vs High_P):\n")
cat(" Significant genes (FDR < 0.05):",
sum(results1_treatment$adj.P.Val < 0.05), "\n")
cat("\nMesa vs Bals effect:\n")
cat(" Significant genes (FDR < 0.05):",
sum(results1_mesa$adj.P.Val < 0.05), "\n")
cat(" Mean logFC:", round(mean(results1_mesa$logFC), 3), "\n")
cat("\nChal vs Bals effect:\n")
cat(" Significant genes (FDR < 0.05):",
sum(results1_chal$adj.P.Val < 0.05), "\n")
cat(" Mean logFC:", round(mean(results1_chal$logFC), 3), "\n")
}
##
## === Model 3 Results ===
##
## Treatment effect (Low_P vs High_P):
## Significant genes (FDR < 0.05): 154
##
## Mesa vs Bals effect:
## Significant genes (FDR < 0.05): 18
## Mean logFC: -0.001
##
## Chal vs Bals effect:
## Significant genes (FDR < 0.05): 17
## Mean logFC: 0.002
results1_mesa$significant <- results1_mesa$adj.P.Val < 0.05
results1_chal$significant <- results1_chal$adj.P.Val < 0.05
p1 <- ggplot(results1_mesa, aes(x = logFC, y = -log10(P.Value))) +
geom_point(aes(color = significant), alpha = 0.5, size = 1) +
scale_color_manual(
values = c("FALSE" = "gray60", "TRUE" = "darkorange"),
labels = c("Not significant", "FDR < 0.05")
) +
labs(
title = "Model 3: Mesa vs Bals",
subtitle = "mexicana (Inv4m) vs parviglumis (CTRL)",
x = "log2 Fold Change",
y = "-log10(p-value)"
) +
theme_bw() +
theme(legend.position = "none")
p2 <- ggplot(results1_chal, aes(x = logFC, y = -log10(P.Value))) +
geom_point(aes(color = significant), alpha = 0.5, size = 1) +
scale_color_manual(
values = c("FALSE" = "gray60", "TRUE" = "darkgreen"),
labels = c("Not significant", "FDR < 0.05")
) +
labs(
title = "Model 3: Chal vs Bals",
subtitle = "mexicana (Inv4m) vs parviglumis (CTRL)",
x = "log2 Fold Change",
y = "-log10(p-value)"
) +
theme_bw() +
theme(legend.position = "none")
gridExtra::grid.arrange(p1, p2, ncol = 2)
Purpose: Check if Mesa and Chal show similar effects (both are Inv4m carriers) or if there are population-specific differences.
# Merge Mesa and Chal results
mesa_chal_compare <- data.frame(
gene = results1_mesa$gene,
mesa_logFC = results1_mesa$logFC,
chal_logFC = results1_chal$logFC
)
cor_mesa_chal <- cor(mesa_chal_compare$mesa_logFC, mesa_chal_compare$chal_logFC)
ggplot(mesa_chal_compare, aes(x = mesa_logFC, y = chal_logFC)) +
geom_point(alpha = 0.3, size = 1, color = "steelblue") +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed",
linewidth = 1) +
geom_smooth(method = "lm", se = FALSE, color = "darkblue", linewidth = 0.8) +
labs(
title = "Mesa vs Chal Effects (both Inv4m carriers)",
subtitle = paste("Correlation:", round(cor_mesa_chal, 3),
"| On diagonaal = identical effects"),
x = "Mesa vs Bals (logFC)",
y = "Chal vs Bals (logFC)"
) +
theme_bw()
Purpose: Demonstrate that gene-specific genotype (geneGT) cannot be included in a limma model because the design matrix must be identical for all genes.
Key Insight: Due to recombination, the same sample can have different genotypes at different gene loci. This means geneGT is a gene-specific predictor, not a sample-level predictor.
Purpose: Visualize how genotypes vary across genes and samples.
# Find genes with genotype variation
gene_gt_var <- apply(geneGT, 1, function(x) length(unique(x)) > 1)
genes_with_variation <- names(gene_gt_var)[gene_gt_var]
# Select subset of genes with variation for visualization
subset_genes <- genes_with_variation[1:50]
# Select the first 15 CTRL samples and the first 15 Inv4m samples
ctrl_samples <- rownames(metadata[metadata$Inversion == "CTRL", ])[1:15]
inv4m_samples <- rownames(metadata[metadata$Inversion == "Inv4m", ])[1:15]
subset_samples <- c(ctrl_samples, inv4m_samples)
# Create annotation for samples (now rows)
sample_annot <- data.frame(
Inversion = metadata[subset_samples, "Inversion"],
Treatment = metadata[subset_samples, "Treatment"],
row.names = subset_samples
)
# Define colors
annot_colors <- list(
Inversion = c(CTRL = "forestgreen", Inv4m = "purple"),
Treatment = c(High_P = "steelblue", Low_P = "coral")
)
pheatmap(
t(geneGT[subset_genes, subset_samples]), # Transpose: samples as rows, genes as columns
cluster_rows = FALSE,
cluster_cols = FALSE,
annotation_row = sample_annot, # Changed from annotation_col
annotation_colors = annot_colors,
color = c("white", "lightblue", "darkblue"),
breaks = c(-0.5, 0.5, 1.5, 2.5),
legend_breaks = c(0, 1, 2),
legend_labels = c("0 (B73/B73)", "1 (Het)", "2 (Alt/Alt)"),
main = "Gene-Level Genotypes Vary Per Gene\n(Recombination creates within-sample variation)",
fontsize_row = 8, # Swapped: samples now rows
fontsize_col = 6, # Swapped: genes now columns (vertical labels by default)
angle_col = 90 # Rotate gene labels to vertical
)
Purpose: Demonstrate the correct approach for gene-specific predictors: fit a separate linear model for each gene.
Equation: For gene \(g\): \(\log_2(\text{expression}_g) = \beta_0 + \beta_1 \cdot \text{Treatment} + \beta_2 \cdot \text{geneGT}_g + \epsilon\)
Approach: Use MatrixEQTL to efficiently test all gene-genotype pairs, then extract diagonal (cis) effects where each gene is tested against its own genotype.
# Normalize counts for linear modeling (log2 CPM)
lib_size <- colSums(counts)
log_counts <- log2(t(t(counts) / lib_size * 1e6) + 1)
# Subset to genes with genotype variation
expr_eqtl <- log_counts[genes_with_variation, ]
gt_eqtl <- geneGT[genes_with_variation, ]
# Prepare SlicedData objects for MatrixEQTL
expr_slice <- SlicedData$new()
expr_slice$CreateFromMatrix(as.matrix(expr_eqtl))
geno_slice <- SlicedData$new()
geno_slice$CreateFromMatrix(as.matrix(gt_eqtl))
# Covariates: Treatment (numeric: High_P=0, Low_P=1)
cvrt_matrix <- matrix(
as.numeric(metadata$Treatment == "Low_P"),
nrow = 1,
dimnames = list("Treatment", rownames(metadata))
)
cvrt_slice <- SlicedData$new()
cvrt_slice$CreateFromMatrix(cvrt_matrix)
time_eqtl <- system.time({
me_cis <- Matrix_eQTL_engine(
snps = geno_slice,
gene = expr_slice,
cvrt = cvrt_slice,
output_file_name = NULL,
pvOutputThreshold = 1, # Return all results
useModel = modelLINEAR,
errorCovariance = numeric(),
verbose = FALSE,
pvalue.hist = FALSE,
min.pv.by.genesnp = FALSE,
noFDRsaveMemory = FALSE
)
})
# Extract cis effects (diagonal: gene tested against its own genotype)
all_eqtls <- me_cis$all$eqtls
all_eqtls$snps <- as.character(all_eqtls$snps)
all_eqtls$gene <- as.character(all_eqtls$gene)
cis_eqtls <- all_eqtls %>%
filter(snps == gene) %>%
select(gene, geneGT_beta = beta, geneGT_pval = pvalue, geneGT_fdr = FDR)
# Diagnostics
{
cat("Ran MatrixEQTL on", length(genes_with_variation), "genes...\n")
cat("Completed in", round(time_eqtl["elapsed"], 1), "seconds\n\n")
cat("Total pairs tested:", format(nrow(all_eqtls), big.mark = ","), "\n")
cat("Cis pairs (diagonal):", nrow(cis_eqtls), "\n")
cat("Significant cis-eQTLs (FDR < 0.05):", sum(cis_eqtls$geneGT_fdr < 0.05), "\n")
}
## Ran MatrixEQTL on 5890 genes...
## Completed in 15.2 seconds
##
## Total pairs tested: 34,692,100
## Cis pairs (diagonal): 5890
## Significant cis-eQTLs (FDR < 0.05): 12
Purpose: Build effect and p-value matrices from MatrixEQTL results for visualization. The analysis was already run above in ~11 seconds using MatrixEQTL (vs ~1 hour with lm() loops).
Interpretation: - Rows = target genes (expression) - Columns = source genes (genotype) - Diagonal = cis effects (a gene’s own genotype affects its expression) - Off-diagonal = trans effects (one gene’s genotype affects another gene)
# Create matrices more efficiently using vectorized operations instead of loops
# all_eqtls contains: snps (source), gene (target), beta, pvalue
n_genes <- length(genes_with_variation)
# Initialize matrices
effect_matrix_full <- matrix(
NA_real_,
nrow = n_genes,
ncol = n_genes,
dimnames = list(genes_with_variation, genes_with_variation)
)
pval_matrix_full <- matrix(
NA_real_,
nrow = n_genes,
ncol = n_genes,
dimnames = list(genes_with_variation, genes_with_variation)
)
# Create gene index mapping for vectorized assignment
gene_index <- setNames(seq_along(genes_with_variation), genes_with_variation)
# Get indices for target (rows) and source (cols) genes
target_indices <- gene_index[all_eqtls$gene] # target genes (expression)
source_indices <- gene_index[all_eqtls$snps] # source genes (genotype)
# Only assign to valid indices (both source and target in our gene set)
valid <- !is.na(target_indices) & !is.na(source_indices)
# Vectorized assignment - much faster than the loop
effect_matrix_full[cbind(target_indices[valid], source_indices[valid])] <- all_eqtls$beta[valid]
pval_matrix_full[cbind(target_indices[valid], source_indices[valid])] <- all_eqtls$pvalue[valid]
# Save results for potential reuse
saveRDS(
effect_matrix_full,
file.path(
here::here("output", "DEG_vs_eQTL_tutorial"),
"effect_matrix_full.rds"
)
)
saveRDS(
pval_matrix_full,
file.path(
here::here("output", "DEG_vs_eQTL_tutorial"),
"pval_matrix_full.rds"
)
)
# Diagnostics
{
cat("=== Built Effect Matrices from MatrixEQTL Results ===\n\n")
cat("Dimensions:", dim(effect_matrix_full), "\n")
cat("Effect range:", round(range(effect_matrix_full, na.rm = TRUE), 3), "\n")
cat("Mean absolute effect:",
round(mean(abs(effect_matrix_full), na.rm = TRUE), 4), "\n")
# Count significant effects at various thresholds
cat("\nSignificant effects (p < 0.05):",
format(sum(pval_matrix_full < 0.05, na.rm = TRUE), big.mark = ","), "\n")
cat("Significant effects (p < 0.01):",
format(sum(pval_matrix_full < 0.01, na.rm = TRUE), big.mark = ","), "\n")
cat("Significant effects (p < 0.001):",
format(sum(pval_matrix_full < 0.001, na.rm = TRUE), big.mark = ","), "\n")
cat("\nSaved: effect_matrix_full.rds, pval_matrix_full.rds\n")
}
## === Built Effect Matrices from MatrixEQTL Results ===
##
## Dimensions: 5890 5890
## Effect range: -5.324 5.324
## Mean absolute effect: 0.4164
##
## Significant effects (p < 0.05): 2,051,786
## Significant effects (p < 0.01): 553,359
## Significant effects (p < 0.001): 115,918
##
## Saved: effect_matrix_full.rds, pval_matrix_full.rds
Purpose: Visualize the significance (-log10 p) of all pairwise eQTL tests using a sparse matrix approach. Genes are ordered by chromosome position so cis effects appear near the diagonal.
# 1. Identify FDR-significant eQTLs (vectorized)
all_eqtls$neglogp <- -log10(all_eqtls$pvalue)
sig_eqtls <- all_eqtls %>% filter(FDR < 0.05)
# 2. Define genome-ordered gene axis
gene_pos_df <- data.frame(
gene_id = genes_annot$gene_id,
chr = as.character(seqnames(genes_annot)),
pos = start(genes_annot),
stringsAsFactors = FALSE
)
gene_order_df <- gene_pos_df %>%
filter(gene_id %in% genes_with_variation, chr %in% as.character(1:10)) %>%
mutate(chr_num = as.integer(chr)) %>%
arrange(chr_num, pos)
gene_order <- gene_order_df$gene_id
gene_index <- setNames(seq_along(gene_order), gene_order)
# Calculate chromosome boundaries in gene index space
# Create a mapping from gene_id to its position in the ordered list
gene_to_idx <- setNames(1:length(gene_order), gene_order)
# Add index position to gene_order_df
gene_order_df$idx <- gene_to_idx[gene_order_df$gene_id]
# Calculate chromosome boundaries
chr_boundaries <- gene_order_df %>%
group_by(chr) %>%
summarize(
start_idx = min(idx),
end_idx = max(idx),
.groups = "drop"
) %>%
mutate(
center_idx = (start_idx + end_idx) / 2,
chr_num = as.integer(chr)
) %>%
arrange(chr_num)
# 4. Construct sparse binary significance matrix
# Rows = target (expression), Columns = source (genotype)
i_idx <- gene_index[sig_eqtls$gene]
j_idx <- gene_index[sig_eqtls$snps]
valid <- !is.na(i_idx) & !is.na(j_idx)
sig_matrix <- sparseMatrix(
i = i_idx[valid],
j = j_idx[valid],
x = 1,
dims = c(length(gene_order), length(gene_order))
)
# 5. Plot full genome-wide significance matrix
par(mar = c(5, 5, 4, 2))
# Check if we have significant eQTLs to plot and matrix has dimensions
has_sig_eqtls <- (length(sig_eqtls) > 0 && nrow(sig_eqtls) > 0 && sum(sig_matrix) > 0)
has_genes_to_plot <- nrow(sig_matrix) > 0 && ncol(sig_matrix) > 0
{
# Debug information
cat("Matrix dimensions:", nrow(sig_matrix), "x", ncol(sig_matrix), "\n")
cat("Non-zero elements in matrix:", sum(sig_matrix), "\n")
cat("Number of significant eQTLs (FDR < 0.05):", nrow(sig_eqtls), "\n")
}
## Matrix dimensions: 5890 x 5890
## Non-zero elements in matrix: 19404
## Number of significant eQTLs (FDR < 0.05): 19404
if (has_sig_eqtls && has_genes_to_plot) {
tryCatch({
# Convert sparse matrix to regular matrix for image function
# But limit the size for display purposes if too large
image_matrix <- as.matrix(sig_matrix)
# Check if matrix is too large to display effectively
# If too large, we might want to show a subset or use a different visualization
if(nrow(image_matrix) > 2000 || ncol(image_matrix) > 2000) {
cat("Matrix too large for full display. Consider showing a subset or summary.\n")
# For very large matrices, we can still plot them with image() but it might be pixelated
}
# Create the image plot
image(
image_matrix,
col = c("white", "red"), # 0 values are white, 1 values are red
useRaster = TRUE,
axes = FALSE,
xlab = "Source Gene (Genotype) - Chromosome",
ylab = "Target Gene (Expression) - Chromosome",
main = "Genome-wide eQTL Significance Matrix\nRed = FDR < 0.05"
)
# Add chromosome boundary lines (vertical and horizontal)
n_genes <- nrow(image_matrix)
for (i in 1:nrow(chr_boundaries)) {
# Normalize indices to [0, 1] for image coordinates
boundary_pos <- chr_boundaries$end_idx[i] / n_genes
abline(v = boundary_pos, col = "gray40", lty = 2, lwd = 0.5)
abline(h = boundary_pos, col = "gray40", lty = 2, lwd = 0.5)
}
# Add X-axis labels (chromosome numbers at centers)
axis(1,
at = chr_boundaries$center_idx / n_genes,
labels = chr_boundaries$chr,
tick = FALSE,
cex.axis = 0.9)
# Add Y-axis labels (chromosome numbers at centers, rotated 90°)
axis(2,
at = chr_boundaries$center_idx / n_genes,
labels = chr_boundaries$chr,
tick = FALSE,
las = 2, # las = 2 rotates labels 90 degrees
cex.axis = 0.9)
# Add box around the plot
box()
cat("Plot created successfully!\n")
}, error = function(e) {
# If image fails, create an alternative plot
n_genes <- max(1, nrow(sig_matrix))
plot(1, type="n",
xlab = "Source Gene (Genotype) - Chromosome",
ylab = "Target Gene (Expression) - Chromosome",
main = "Genome-wide eQTL Significance Matrix\nRed = FDR < 0.05",
xlim = c(0, n_genes),
ylim = c(0, n_genes))
# Add chromosome boundary lines for the fallback plot too
for (i in 1:nrow(chr_boundaries)) {
boundary_pos <- chr_boundaries$end_idx[i] / n_genes
abline(v = boundary_pos, col = "gray40", lty = 2, lwd = 0.5)
abline(h = boundary_pos, col = "gray40", lty = 2, lwd = 0.5)
}
# Add axis labels for fallback plot
if(nrow(chr_boundaries) > 0) {
axis(1,
at = chr_boundaries$center_idx / n_genes,
labels = chr_boundaries$chr,
tick = FALSE,
cex.axis = 0.9)
axis(2,
at = chr_boundaries$center_idx / n_genes,
labels = chr_boundaries$chr,
tick = FALSE,
las = 2,
cex.axis = 0.9)
box()
}
text(n_genes/2, n_genes/2,
paste("Plot error:", e$message),
cex = 1.0, col = "red")
cat("Plot failed with error:", e$message, "\n")
})
} else {
# Create an empty plot with appropriate title and labels
n_genes <- max(1, nrow(sig_matrix))
plot(1, type="n",
xlab = "Source Gene (Genotype) - Chromosome",
ylab = "Target Gene (Expression) - Chromosome",
main = "Genome-wide eQTL Significance Matrix\nRed = FDR < 0.05",
xlim = c(0, n_genes),
ylim = c(0, n_genes))
text(n_genes/2, n_genes/2,
"No significant eQTLs pass FDR < 0.05 threshold",
cex = 1.2, col = "red")
cat("No plot generated - no significant eQTLs pass threshold\n")
}
## Matrix too large for full display. Consider showing a subset or summary.
## Plot created successfully!
Purpose: Find the gene whose genotype has the most significant effect on other genes’ expression (excluding chromosome 4), and visualize its trans effects across the genome.
# Get gene positions
gene_pos <- data.frame(
gene_id = genes_annot$gene_id,
chr = as.character(seqnames(genes_annot)),
pos = start(genes_annot),
stringsAsFactors = FALSE
)
# Filter to genes that exist in both position info and cached matrix
genes_in_matrix <- colnames(pval_matrix_full)
genes_with_pos <- intersect(genes_in_matrix, gene_pos$gene_id)
# Exclude chromosome 4 source genes
chr4_genes <- gene_pos$gene_id[gene_pos$chr == "4"]
source_genes_no_chr4 <- setdiff(genes_with_pos, chr4_genes)
{
cat("=== Finding Top eQTL Gene (Excluding Chr 4) ===\n\n")
cat("Genes in cached matrix:", length(genes_in_matrix), "\n")
cat("Source genes on chr 4:", length(intersect(genes_in_matrix, chr4_genes)), "\n")
cat("Source genes excluding chr 4:", length(source_genes_no_chr4), "\n")
}
## === Finding Top eQTL Gene (Excluding Chr 4) ===
##
## Genes in cached matrix: 5890
## Source genes on chr 4: 899
## Source genes excluding chr 4: 4991
# For each source gene (excluding chr4), find its minimum p-value
# across all target genes
source_stats <- data.frame(
source_gene = source_genes_no_chr4,
min_pval = vapply(source_genes_no_chr4, function(g) {
min(pval_matrix_full[, g], na.rm = TRUE)
}, numeric(1)),
max_abs_effect = vapply(source_genes_no_chr4, function(g) {
max(abs(effect_matrix_full[, g]), na.rm = TRUE)
}, numeric(1)),
stringsAsFactors = FALSE
)
# Sort by p-value first, then by absolute effect
source_stats <- source_stats %>%
arrange(min_pval, desc(max_abs_effect))
top_source_gene <- source_stats$source_gene[1]
top_source_pval <- source_stats$min_pval[1]
top_source_effect <- source_stats$max_abs_effect[1]
# Get chromosome of top gene
top_gene_chr <- gene_pos$chr[gene_pos$gene_id == top_source_gene]
top_gene_pos <- gene_pos$pos[gene_pos$gene_id == top_source_gene]
{
cat("Location: chr", top_gene_chr, "pos",
format(top_gene_pos, big.mark = ","), "\n")
cat("\nTop source gene:", top_source_gene, "\n")
cat("Minimum p-value:", format(top_source_pval, scientific = TRUE), "\n")
cat("Maximum absolute effect:", round(top_source_effect, 3), "\n")
}
## Location: chr 1 pos 200,538,436
##
## Top source gene: Zm00001eb037190
## Minimum p-value: 9.731841e-12
## Maximum absolute effect: 2.354
manhattan_data <- data.frame(
target_gene = genes_with_pos,
effect = effect_matrix_full[genes_with_pos, top_source_gene],
pval = pval_matrix_full[genes_with_pos, top_source_gene],
stringsAsFactors = FALSE
)
manhattan_data <- manhattan_data %>%
left_join(gene_pos, by = c("target_gene" = "gene_id")) %>%
filter(chr %in% as.character(1:10)) %>%
mutate(chr = factor(chr, levels = as.character(1:10))) %>%
arrange(chr, pos)
# Calculate cumulative positions
chr_info <- manhattan_data %>%
group_by(chr) %>%
summarize(chr_len = max(pos), .groups = "drop") %>%
mutate(chr_offset = lag(cumsum(chr_len), default = 0))
manhattan_data <- manhattan_data %>%
left_join(chr_info, by = "chr") %>%
mutate(cumpos = pos + chr_offset)
chr_centers <- manhattan_data %>%
group_by(chr) %>%
summarize(center = mean(cumpos), .groups = "drop")
# Mark if target is the source gene itself (cis)
manhattan_data$is_cis <- manhattan_data$target_gene == top_source_gene
# Create Manhattan plot
ggplot(manhattan_data, aes(x = cumpos, y = -log10(pval))) +
geom_point(aes(color = chr), alpha = 0.6, size = 1) +
geom_point(data = filter(manhattan_data, is_cis),
aes(y = -0.5), color = "red", size = 3, shape = 17) +
scale_color_manual(values = rep(c("steelblue", "coral"), 5), guide = "none") +
scale_x_continuous(breaks = chr_centers$center, labels = chr_centers$chr) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "gray50") +
geom_hline(yintercept = -log10(0.05 / nrow(manhattan_data)),
linetype = "dashed", color = "red") +
labs(
title = paste("Trans-eQTL Manhattan Plot: Effects of", top_source_gene,
"genotype on all genes"),
subtitle = paste("Source gene on chr", top_gene_chr,
"| Red triangle = cis effect"),
x = "Target Gene Chromosome",
y = "-log10(p-value)",
caption = "Gray line = p < 0.05 | Red line = Bonferroni threshold"
) +
theme_bw() +
theme(panel.grid.minor = element_blank())
# Find top trans effects (excluding diagonal/cis) using the already available all_eqtls data
# This avoids creating a huge dense matrix representation
# Filter to trans effects (source != target gene) and remove NAs
trans_effects <- all_eqtls %>%
filter(snps != gene) %>% # Exclude cis effects (diagonal)
filter(!is.na(pvalue)) %>% # Remove NAs
select(source = snps, target = gene, effect = beta, pval = pvalue)
# Add chromosome info (use dplyr:: to avoid S4Vectors conflict)
trans_effects <- trans_effects %>%
left_join(gene_pos %>%
dplyr::select(gene_id, chr) %>%
dplyr::rename(target_chr = chr),
by = c("target" = "gene_id")) %>%
left_join(gene_pos %>%
dplyr::select(gene_id, chr) %>%
dplyr::rename(source_chr = chr),
by = c("source" = "gene_id"))
# Sort by p-value, then absolute effect
trans_effects <- trans_effects %>%
arrange(pval, desc(abs(effect)))
# Same-chromosome vs different-chromosome
trans_effects$same_chr <- trans_effects$target_chr == trans_effects$source_chr
{
cat("=== Top 20 Trans-eQTL Effects ===\n\n")
print(head(trans_effects %>%
dplyr::select(source, source_chr, target, target_chr, effect, pval) %>%
mutate(effect = round(effect, 3),
pval = format(pval, digits = 3, scientific = TRUE)),
20))
# Summary statistics
cat("\n=== Trans-eQTL Summary ===\n")
cat("Total trans pairs tested:", nrow(trans_effects), "\n")
cat("Significant trans effects (p < 0.05):", sum(trans_effects$pval < 0.05), "\n")
cat("Significant trans effects (p < 0.001):", sum(trans_effects$pval < 0.001), "\n")
cat("\nSame chromosome trans effects (p < 0.05):",
sum(trans_effects$pval < 0.05 & trans_effects$same_chr, na.rm = TRUE), "\n")
cat("Different chromosome trans effects (p < 0.05):",
sum(trans_effects$pval < 0.05 & !trans_effects$same_chr, na.rm = TRUE), "\n")
}
## === Top 20 Trans-eQTL Effects ===
##
## source source_chr target target_chr effect pval
## 1 Zm00001eb187450 4 Zm00001eb189910 4 -1.242 1.93e-12
## 2 Zm00001eb192350 4 Zm00001eb189910 4 -1.242 1.93e-12
## 3 Zm00001eb192590 4 Zm00001eb189910 4 -1.242 1.93e-12
## 4 Zm00001eb194640 4 Zm00001eb189910 4 -1.242 1.93e-12
## 5 Zm00001eb187760 4 Zm00001eb189910 4 -1.201 4.83e-12
## 6 Zm00001eb188700 4 Zm00001eb189910 4 -1.201 4.83e-12
## 7 Zm00001eb188960 4 Zm00001eb189910 4 -1.201 4.83e-12
## 8 Zm00001eb192140 4 Zm00001eb189910 4 -1.201 4.83e-12
## 9 Zm00001eb037190 1 Zm00001eb189910 4 -2.354 9.73e-12
## 10 Zm00001eb059620 1 Zm00001eb189910 4 -2.354 9.73e-12
## 11 Zm00001eb118150 2 Zm00001eb189910 4 -2.354 9.73e-12
## 12 Zm00001eb213970 5 Zm00001eb189910 4 -2.354 9.73e-12
## 13 Zm00001eb219160 5 Zm00001eb189910 4 -2.354 9.73e-12
## 14 Zm00001eb255880 5 Zm00001eb189910 4 -2.354 9.73e-12
## 15 Zm00001eb258680 5 Zm00001eb189910 4 -2.354 9.73e-12
## 16 Zm00001eb259080 5 Zm00001eb189910 4 -2.354 9.73e-12
## 17 Zm00001eb277680 6 Zm00001eb189910 4 -2.354 9.73e-12
## 18 Zm00001eb338960 8 Zm00001eb189910 4 -2.354 9.73e-12
## 19 Zm00001eb362510 8 Zm00001eb189910 4 -2.354 9.73e-12
## 20 Zm00001eb409780 10 Zm00001eb189910 4 -2.354 9.73e-12
##
## === Trans-eQTL Summary ===
## Total trans pairs tested: 34686210
## Significant trans effects (p < 0.05): 2051374
## Significant trans effects (p < 0.001): 115870
##
## Same chromosome trans effects (p < 0.05): 251962
## Different chromosome trans effects (p < 0.05): 1799412
# Count significant results from current models
# Note: The model structure has been simplified, so we only have the current model results
n_deg_model1_treat <- sum(results1_treatment$adj.P.Val < 0.05)
n_deg_model1_mesa <- sum(results1_mesa$adj.P.Val < 0.05)
n_deg_model1_chal <- sum(results1_chal$adj.P.Val < 0.05)
n_eqtl_gt <- sum(cis_eqtls$geneGT_fdr < 0.05)
summary_df <- data.frame(
Model = c(
"limma ~ Treatment + donor_taxa",
"limma ~ Treatment + geneGT",
"MatrixEQTL (cis)"
),
Treatment_DEGs = c(
n_deg_model1_treat,
"IMPOSSIBLE",
"(covariate)"
),
Genotype_DEGs = c(
paste0(n_deg_model1_mesa, " (Mesa) / ", n_deg_model1_chal, " (Chal)"),
"IMPOSSIBLE",
n_eqtl_gt
),
Notes = c(
"More granular: separate Mesa/Chal effects",
"Cannot vary design per gene",
"~11 sec for 35M tests (vs ~1h lm loop)"
),
check.names = FALSE
)
knitr::kable(summary_df, caption = "Analysis Approaches Compared")
| Model | Treatment_DEGs | Genotype_DEGs | Notes |
|---|---|---|---|
| limma ~ Treatment + donor_taxa | 154 | 18 (Mesa) / 17 (Chal) | More granular: separate Mesa/Chal effects |
| limma ~ Treatment + geneGT | IMPOSSIBLE | IMPOSSIBLE | Cannot vary design per gene |
| MatrixEQTL (cis) | (covariate) | 12 | ~11 sec for 35M tests (vs ~1h lm loop) |
DEG/limma requires sample-level predictors: The design matrix is shared across all genes, so predictors must have one value per sample.
Inversion status is an imperfect proxy: Using sample-level “Inv4m carrier” status ignores recombination—some Inv4m samples have B73 alleles at certain genes due to recombination breakpoints.
Gene-specific predictors require eQTL approach: When the predictor varies per gene (like local genotype), you must fit a separate model for each gene.
The approaches are complementary:
DEG/limma (one model, all genes): \[\log_2(E_g) = \mathbf{X} \boldsymbol{\beta} + \epsilon_g\]
Where \(\mathbf{X}\) is the same design matrix for all genes \(g\).
eQTL (one model per gene): \[\log_2(E_g) = \mathbf{X}_g \boldsymbol{\beta}_g + \epsilon_g\]
Where \(\mathbf{X}_g\) can differ for each gene \(g\) (e.g., including gene-specific genotype).
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] pheatmap_1.0.13 rtracklayer_1.70.0 GenomicRanges_1.62.0
## [4] Seqinfo_1.0.0 IRanges_2.44.0 S4Vectors_0.48.0
## [7] BiocGenerics_0.56.0 generics_0.1.4 tidyr_1.3.1
## [10] ggplot2_4.0.1 dplyr_1.1.4 Matrix_1.7-4
## [13] MatrixEQTL_2.3 edgeR_4.8.0 limma_3.66.0
##
## loaded via a namespace (and not attached):
## [1] SummarizedExperiment_1.40.0 gtable_0.3.6
## [3] rjson_0.2.23 xfun_0.54
## [5] bslib_0.9.0 Biobase_2.70.0
## [7] lattice_0.22-7 vctrs_0.6.5
## [9] tools_4.5.1 bitops_1.0-9
## [11] curl_7.0.0 parallel_4.5.1
## [13] tibble_3.3.0 pkgconfig_2.0.3
## [15] RColorBrewer_1.1-3 cigarillo_1.0.0
## [17] S7_0.2.1 lifecycle_1.0.4
## [19] compiler_4.5.1 farver_2.1.2
## [21] Rsamtools_2.26.0 Biostrings_2.78.0
## [23] statmod_1.5.1 codetools_0.2-20
## [25] htmltools_0.5.9 sass_0.4.10
## [27] RCurl_1.98-1.17 yaml_2.3.12
## [29] pillar_1.11.1 crayon_1.5.3
## [31] jquerylib_0.1.4 BiocParallel_1.44.0
## [33] DelayedArray_0.36.0 cachem_1.1.0
## [35] abind_1.4-8 nlme_3.1-168
## [37] tidyselect_1.2.1 locfit_1.5-9.12
## [39] digest_0.6.39 purrr_1.2.0
## [41] restfulr_0.0.16 splines_4.5.1
## [43] labeling_0.4.3 rprojroot_2.1.1
## [45] fastmap_1.2.0 grid_4.5.1
## [47] SparseArray_1.10.6 here_1.0.2
## [49] cli_3.6.5 magrittr_2.0.4
## [51] S4Arrays_1.10.1 XML_3.99-0.20
## [53] withr_3.0.2 scales_1.4.0
## [55] rmarkdown_2.30 XVector_0.50.0
## [57] httr_1.4.7 matrixStats_1.5.0
## [59] gridExtra_2.3 evaluate_1.0.5
## [61] knitr_1.50 BiocIO_1.20.0
## [63] mgcv_1.9-4 rlang_1.1.6
## [65] glue_1.8.0 rstudioapi_0.17.1
## [67] jsonlite_2.0.0 R6_2.6.1
## [69] MatrixGenerics_1.22.0 GenomicAlignments_1.46.0