Purpose: Plot genotype per NIL. Characterize linkage disequilibrium between the Inv4m chromosomal inversion and genome-wide markers in maize near-isogenic lines.
Approach: Calculate Spearman correlations between a tagging SNP within Inv4m (S4_181558683) and all genotyped markers across the genome. Identify regions with significant LD and visualize genotype structure.
Expected outcome: Identify the physical extent of linkage with Inv4m, detect recombination breakpoints, and characterize flanking introgression segments.
library(vcfR)
library(dplyr)
library(ggplot2)
library(ggtext)
library(rtracklayer)
library(GenomicRanges)
library(cowplot)
Purpose: Establish coordinates for Inv4m inversion and shared introgression.
Approach: Extract gene boundaries from B73 v5.0 annotation to define inversion breakpoints identified in MCScan analysis.
# Load B73 annotation
myGFF <- "/Users/fvrodriguez/ref/zea/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.59.chr.gff3"
B73 <- rtracklayer::import(myGFF) %>%
subset(type == "gene" & seqnames %in% 1:10)
genes <- as.data.frame(B73)
genes$ID <- gsub("gene:", "", genes$ID)
# Inv4m boundaries (from MCScan)
inv4m_start <- genes[genes$ID == "Zm00001eb190470", "start"]
inv4m_end <- genes[genes$ID == "Zm00001eb194800", "end"]
# Shared introgression boundaries (empirical)
introgression_start <- 157012149
introgression_end <- 195900523
# Create GRanges objects
inv4m <- GRanges(
seqnames = "4",
ranges = IRanges(start = inv4m_start, end = inv4m_end),
strand = "+"
)
shared_introgression <- GRanges(
seqnames = "4",
ranges = IRanges(start = introgression_start, end = introgression_end),
strand = "+"
)
# Region sizes
inv4m_size <- round((inv4m_end - inv4m_start) / 1e6, 1)
introgression_size <- round((introgression_end - introgression_start) / 1e6, 1)
# Count genes in regions
olap_inv4m <- findOverlaps(inv4m, B73, ignore.strand = TRUE)
inv4m_gene_ids <- genes$ID[subjectHits(olap_inv4m)]
olap_introgression <- findOverlaps(shared_introgression, B73, ignore.strand = TRUE)
shared_introgression_gene_ids <- genes$ID[subjectHits(olap_introgression)]
flanking_introgression_gene_ids <- shared_introgression_gene_ids[
!(shared_introgression_gene_ids %in% inv4m_gene_ids)
]
cat("Inv4m inversion:", inv4m_size, "Mb |", length(inv4m_gene_ids), "genes\n")
## Inv4m inversion: 15.2 Mb | 434 genes
cat("Shared introgression:", introgression_size, "Mb |",
length(shared_introgression_gene_ids), "genes\n")
## Shared introgression: 38.9 Mb | 1101 genes
cat("Flanking regions:",
round(introgression_size - inv4m_size, 1), "Mb |",
length(flanking_introgression_gene_ids), "genes\n")
## Flanking regions: 23.7 Mb | 667 genes
Purpose: Load genotype data and convert to numeric matrix.
Approach: Read VCF, extract GT field, recode genotypes as 0/1/2 for B73/Het/Mi21.
# Read VCF
vcf <- read.vcfR("~/Desktop/inv4m_PSU_imputed.vcf")
## Scanning file to determine attributes.
## File attributes:
## meta lines: 10
## header_line: 11
## variant count: 19861
## column count: 22
## Meta line 10 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 19861
## Character matrix gt cols: 22
## skip: 0
## nrows: 19861
## row_num: 0
## Processed variant 1000Processed variant 2000Processed variant 3000Processed variant 4000Processed variant 5000Processed variant 6000Processed variant 7000Processed variant 8000Processed variant 9000Processed variant 10000Processed variant 11000Processed variant 12000Processed variant 13000Processed variant 14000Processed variant 15000Processed variant 16000Processed variant 17000Processed variant 18000Processed variant 19000Processed variant: 19861
## All variants processed
gt <- extract.gt(vcf, element = "GT")
# Recode genotypes
gt[gt == "0/0"] <- 0
gt[gt == "0/1" | gt == "1/0"] <- 1
gt[gt == "1/1"] <- 2
# Convert to numeric matrix
ngt <- matrix(as.integer(gt), ncol = ncol(gt))
colnames(ngt) <- colnames(gt)
rownames(ngt) <- rownames(gt)
# Sample ordering (CTRL then INV4 carriers, alphabetically within group)
sample_order <- c(
sort(c("3095", "3113", "4113", "3083", "4107", "4122")),
sort(c("3092", "3080", "3059", "3131", "4125", "4080", "4131"))
)
ngt <- ngt[, sample_order] %>% t()
# Genotype distribution (by site)
total_sites <- ncol(ngt) * nrow(ngt)
geno_freq <- table(ngt) / total_sites
cat("Genotype frequencies (by site):\n")
## Genotype frequencies (by site):
cat(" B73/B73 (0):", round(geno_freq["0"], 3), "\n")
## B73/B73 (0): 0.636
cat(" B73/Mi21 (1):", round(geno_freq["1"], 3), "\n")
## B73/Mi21 (1): 0.09
cat(" Mi21/Mi21 (2):", round(geno_freq["2"], 3), "\n")
## Mi21/Mi21 (2): 0.273
cat(" Missing:", round(sum(is.na(ngt)) / total_sites, 4), "\n")
## Missing: 0.0016
cat("Dimensions:", nrow(ngt), "samples ×", ncol(ngt), "markers\n")
## Dimensions: 13 samples × 19861 markers
Purpose: Quantify introgression content across samples.
Approach: Calculate run-length encoding of genotypes, sum by allelic state.
get_genotype_runs <- function(x, sample_order) {
result <- data.frame()
chr_levels <- paste0("S", 1:10)
for (ind_idx in sample_order) {
result_ind <- data.frame()
for (chr_idx in 1:10) {
chr_prefix <- paste0("S", chr_idx, "_")
chr_name <- paste0("S", chr_idx)
chr <- x[grepl(chr_prefix, rownames(x), perl = TRUE), ]
runs <- rle(chr[, ind_idx])
chr_long <- data.frame(
ind = ind_idx,
chr = chr_name,
pos = as.integer(gsub(chr_prefix, "", names(runs$values))),
gt = runs$values
) %>%
group_by(ind, chr) %>%
mutate(
lag = lag(pos, default = 0),
span = (pos - lag) / 1e6,
chr = factor(chr, levels = chr_levels),
ind = factor(ind, levels = sample_order),
missing = is.na(gt),
missing_length = missing * span,
B73 = as.numeric(gt == 0),
B73_length = B73 * span,
Het = as.numeric(gt == 1),
Het_length = Het * span,
Mi21 = as.numeric(gt == 2),
Mi21_length = Mi21 * span
)
result_ind <- rbind(result_ind, chr_long)
}
result <- rbind(result, result_ind)
}
result
}
# Calculate runs
m <- t(ngt)
gt_runs <- get_genotype_runs(m, sample_order)
# Add grouping
gt_runs$group <- case_when(
gt_runs$ind %in% c("3092", "3080", "3059", "3131", "4125", "4080", "4131") ~
"Inv4m",
.default = "CTRL"
)
# Genome size
chr_length <- gt_runs %>%
group_by(chr) %>%
summarise(chr_length = max(pos) / 1e6)
genome_size <- sum(chr_length$chr_length)
# Summarize by genotype
by_length <- gt_runs %>%
inner_join(chr_length, by = "chr") %>%
ungroup() %>%
group_by(ind) %>%
summarise(
B73 = sum(B73_length, na.rm = TRUE),
Het = sum(Het_length, na.rm = TRUE),
Mi21 = sum(Mi21_length, na.rm = TRUE),
missing = sum(missing_length, na.rm = TRUE),
group = group[1]
) %>%
mutate(group = factor(group, levels = c("CTRL", "Inv4m"))) %>%
tidyr::pivot_longer(
cols = c("B73", "Het", "Mi21", "missing"),
names_to = "Genotype",
values_to = "length"
) %>%
mutate(pct = 100 * length / genome_size)
cat("Genome size:", round(genome_size, 1), "Mb\n")
## Genome size: 2128.1 Mb
cat("\nGenotype run length summary (Mb):\n")
##
## Genotype run length summary (Mb):
by_length %>%
group_by(Genotype) %>%
summarise(
mean_Mb = round(mean(length), 1),
mean_pct = round(mean(pct), 2)
) %>%
print()
## # A tibble: 4 × 3
## Genotype mean_Mb mean_pct
## <chr> <dbl> <dbl>
## 1 B73 1821. 85.6
## 2 Het 183. 8.61
## 3 Mi21 117. 5.5
## 4 missing 7.3 0.34
cat("\nBy introgression group:\n")
##
## By introgression group:
by_length %>%
group_by(group, Genotype) %>%
summarise(
mean_Mb = round(mean(length), 1),
mean_pct = round(mean(pct), 2),
.groups = "drop"
) %>%
print()
## # A tibble: 8 × 4
## group Genotype mean_Mb mean_pct
## <fct> <chr> <dbl> <dbl>
## 1 CTRL B73 1849. 86.9
## 2 CTRL Het 170 7.99
## 3 CTRL Mi21 102 4.79
## 4 CTRL missing 6.8 0.32
## 5 Inv4m B73 1796. 84.4
## 6 Inv4m Het 195. 9.15
## 7 Inv4m Mi21 130. 6.1
## 8 Inv4m missing 7.8 0.37
Purpose: Create chromosome-wide haplotype plots.
Approach: Plot genotype runs as colored bars across chromosomes.
# Genotype color breaks
breaks <- c(0, 1.5, 2.5)
names(breaks) <- c("B73/B73", "B73/Mi21", "Mi21/Mi21")
# Selection marker triangle
triangle <- data.frame(
pos = 0,
span = 181.6,
ind = sample_order[length(sample_order)],
chr = "4",
label = factor("Selection marker<br>group")
)
# Coordinate limits
inversion_limit <- round(c(inv4m_start, inv4m_end) / 1e6, 2)
introgression_limit <- round(c(introgression_start, introgression_end) / 1e6, 2)
gt_plot <- gt_runs %>%
ungroup() %>%
mutate(chr = gsub("S", "", chr)) %>%
ggplot(aes(y = ind, x = span, fill = gt)) +
geom_col(width = 1) +
scale_fill_viridis_b(
breaks = breaks,
limits = c(-0.5, 2.5),
show.limits = TRUE,
na.value = "gray85",
direction = -1
) +
geom_point(
data = triangle,
mapping = aes(shape = label),
position = position_nudge(y = 2),
size = 1,
color = "#43137D",
fill = "#43137D"
) +
scale_shape_manual(values = 25) +
facet_wrap(factor(chr, 1:10) ~ ., strip.position = "left", ncol = 1) +
coord_cartesian(xlim = c(0, 450), expand = FALSE) +
xlab("Position [Mb]") +
ylab("Chromosome") +
labs(fill = "Genotype") +
guides(
fill = guide_colorsteps(
order = 1,
label.position = "left",
label.hjust = 1,
theme = theme(
legend.title = element_text(size = 8, hjust = 0),
legend.text = element_text(size = 8, vjust = 1.5),
legend.key.width = unit(1, "lines"),
legend.key.height = unit(2.7, "lines")
)
),
shape = guide_legend(
order = 2,
override.aes = list(size = 4),
label.position = "left",
label.hjust = 1,
theme = theme(
legend.title = element_blank(),
legend.text = element_markdown(size = 8),
legend.key.width = unit(1, "lines")
)
)
) +
ggpubr::theme_classic2(base_size = 4) +
theme(
legend.position = c(0.9, 0.85),
legend.spacing.y = unit(0.1, "lines"),
legend.box.just = "right",
strip.background = element_blank(),
strip.text.y.left = element_text(size = 12, angle = 0),
axis.text = element_text(size = 12),
axis.title.y = element_text(size = 14),
axis.title.x = element_text(size = 14),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank()
)
gt_plot
# Chr4 detailed inset
inlay <- gt_runs %>%
filter(chr == "S4") %>%
ungroup() %>%
mutate(chr = gsub("S", "", chr)) %>%
ggplot(aes(x = span, y = ind, fill = gt)) +
geom_col(width = 1) +
scale_fill_viridis_b(
breaks = breaks,
limits = c(-0.5, 2.5),
show.limits = TRUE,
na.value = "gray85",
direction = -1
) +
annotate("point",
x = 181.6, y = 14.4,
shape = 25, size = 4, color = "#43137D", fill = "#43137D"
) +
annotate("text",
x = c(inversion_limit, introgression_limit), y = 14.4,
label = rep("|", 4), size = 4, color = "#43137D"
) +
coord_cartesian(xlim = c(150, 200)) +
xlab("Chromosome 4") +
ggpubr::theme_classic2(base_size = 8) +
theme(
legend.position = "none",
strip.background = element_blank(),
strip.text = element_blank(),
plot.title = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank(),
plot.background = element_rect(fill = "transparent", color = NA)
)
# Combine plots
plot_with_inset <- ggdraw() +
draw_plot(gt_plot) +
draw_plot(inlay,
x = 0.45, y = 0.03,
width = 0.6, height = 0.6,
scale = 0.8
)
plot_with_inset
# Background means
bg_mean <- by_length %>%
ungroup() %>%
group_by(Genotype) %>%
summarise_at(.vars = c("length", "pct"), .funs = mean)
p_genotype <- by_length %>%
ggplot(aes(x = group, y = pct)) +
geom_hline(data = bg_mean, aes(yintercept = pct), linetype = 2) +
geom_boxplot(outlier.colour = "white", width = 0.25) +
ggbeeswarm::geom_quasirandom(size = 2) +
facet_wrap(. ~ Genotype, scales = "free_y", ncol = 4) +
xlab("Introgression group") +
ylab("% of genome length") +
ggpubr::theme_classic2(base_size = 20) +
theme(strip.background = element_blank())
p_genotype
Purpose: Quantify correlation between tagging SNP and all genome-wide markers.
Approach: Spearman correlation with S4_181558683 (within Inv4m). FDR correction for multiple testing.
# Calculate correlations
tagging_snp <- "S4_181558683"
cor_matrix <- cor(
ngt[, tagging_snp, drop = FALSE],
ngt,
use = "pairwise.complete.obs"
) %>% t()
# Statistical testing
cor_pvalues <- apply(ngt, 2, function(x) {
cor.test(x, y = ngt[, tagging_snp], method = "spearman")$p.value
})
# Combine results
significance_threshold <- 0.01
inv4m_cor <- data.frame(
marker = rownames(cor_matrix),
r = cor_matrix[, 1]
) %>%
tidyr::separate(marker, into = c("CHR", "BP"), sep = "_") %>%
mutate(
BP = as.numeric(BP),
CHR = gsub("S", "", CHR) %>% factor(levels = as.character(1:10)),
p = cor_pvalues,
FDR = p.adjust(p, method = "fdr"),
R2 = r^2,
is_significant = FDR < significance_threshold
) %>%
tibble::rownames_to_column("ID")
cat("Total markers:", nrow(inv4m_cor), "\n")
## Total markers: 19861
cat("Significant (FDR <", significance_threshold, "):",
sum(inv4m_cor$is_significant, na.rm = TRUE),
sprintf("(%.1f%%)", 100 * mean(inv4m_cor$is_significant, na.rm = TRUE)), "\n")
## Significant (FDR < 0.01 ): 7780 (45.4%)
cat("Perfect LD (R² = 1):", sum(inv4m_cor$R2 == 1, na.rm = TRUE), "\n")
## Perfect LD (R² = 1): 3758
hist(inv4m_cor$R2,
main = "R² distribution with Inv4m tagging marker\n19,861 markers total",
xlab = "R²",
ylim = c(0, 8000),
col = "skyblue",
breaks = 30)
Purpose: Identify genes overlapping Inv4m-correlated regions to create filter list for downstream analyses.
Approach: Use GenomicRanges::findOverlaps to map significant SNPs to gene annotations. Classify genes by location: inside shared introgression, dispersed background, or anti-correlated upstream.
Expected outcome: Generate comprehensive list of genes to exclude from analyses aimed at identifying trans-regulatory effects of Inv4m.
# Load transcript annotations
transcripts_gff <- rtracklayer::import(myGFF) %>%
subset(type == "mRNA" & seqnames %in% 1:10)
transcripts <- as.data.frame(transcripts_gff)
transcripts$gene_id <- gsub("transcript:", "", transcripts$ID)
transcripts$gene_id <- gsub("_.*", "", transcripts$gene_id, perl = TRUE)
# Convert SNP sets to GRanges for overlap
outside_correlated_gr <- with(outside_correlated, {
GRanges(
seqnames = CHR,
ranges = IRanges(start = BP, end = BP),
ID = ID
)
})
# 1. Outside shared introgression (dispersed background)
olap_outside <- findOverlaps(outside_correlated_gr, transcripts_gff,
ignore.strand = TRUE)
outside_correlated_genes <- transcripts$gene_id[subjectHits(olap_outside)] %>%
sort() %>%
unique()
cat("Outside shared introgression:\n")
## Outside shared introgression:
cat(" SNPs:", length(outside_correlated_gr), "\n")
## SNPs: 382
cat(" Unique SNP positions:", length(unique(queryHits(olap_outside))), "\n")
## Unique SNP positions: 286
cat(" Genes overlapped:", length(outside_correlated_genes), "\n")
## Genes overlapped: 58
# 2. Anti-correlated upstream region (154.3-157.0 Mb)
olap_anti <- findOverlaps(anticorrelated, transcripts_gff,
ignore.strand = TRUE)
anticorrelated_genes <- transcripts$gene_id[subjectHits(olap_anti)] %>%
sort() %>%
unique()
cat("\nAnti-correlated upstream (154.3-157.0 Mb):\n")
##
## Anti-correlated upstream (154.3-157.0 Mb):
cat(" SNPs:", length(anticorrelated), "\n")
## SNPs: 207
cat(" Unique SNP positions:", length(unique(queryHits(olap_anti))), "\n")
## Unique SNP positions: 177
cat(" Genes overlapped:", length(anticorrelated_genes), "\n")
## Genes overlapped: 14
# 3. Inside shared introgression (157.0-195.9 Mb, 35 Mb)
olap_inside <- findOverlaps(inside_correlated, transcripts_gff,
ignore.strand = TRUE)
inside_correlated_genes <- transcripts$gene_id[subjectHits(olap_inside)] %>%
sort() %>%
unique()
cat("\nInside shared introgression (157.0-195.9 Mb):\n")
##
## Inside shared introgression (157.0-195.9 Mb):
cat(" SNPs:", length(inside_correlated), "\n")
## SNPs: 5984
cat(" Unique SNP positions:", length(unique(queryHits(olap_inside))), "\n")
## Unique SNP positions: 5666
cat(" Genes overlapped:", length(inside_correlated_genes), "\n")
## Genes overlapped: 402
# 4. Create filter list and classification table
# Only filter genes OUTSIDE the introgression that are correlated
# Note: anticorrelated genes are a SUBSET of outside_correlated genes
# (they're outside the introgression, specifically in the upstream region)
# Filter list is simply all outside genes (includes anticorrelated as subset)
outside_inv4m_genes <- outside_correlated_genes
cat("\nGenes to filter out (outside Inv4m but correlated):",
length(outside_inv4m_genes), "\n")
##
## Genes to filter out (outside Inv4m but correlated): 58
cat(" Total outside (includes anti-correlated):", length(outside_correlated_genes), "\n")
## Total outside (includes anti-correlated): 58
cat(" Anti-correlated subset:", length(anticorrelated_genes),
sprintf("(%.1f%% of outside)", 100 * length(anticorrelated_genes) / length(outside_correlated_genes)),
"\n")
## Anti-correlated subset: 14 (24.1% of outside)
# Create classification table for all correlated genes
# Anti-correlated genes get their own label even though they're subset of outside
gene_classification <- data.frame(
gene_id = c(inside_correlated_genes,
outside_correlated_genes),
classification = c(
rep("inside_introgression", length(inside_correlated_genes)),
rep("outside_dispersed", length(outside_correlated_genes))
)
) %>%
distinct() %>%
mutate(
classification = ifelse(
gene_id %in% anticorrelated_genes,
"outside_anticorrelated",
classification
)
)
cat("\nTotal genes correlated with Inv4m:", nrow(gene_classification), "\n")
##
## Total genes correlated with Inv4m: 460
cat(" Inside introgression:", sum(gene_classification$classification == "inside_introgression"), "\n")
## Inside introgression: 402
cat(" Outside dispersed:", sum(gene_classification$classification == "outside_dispersed"), "\n")
## Outside dispersed: 44
cat(" Outside anti-correlated:", sum(gene_classification$classification == "outside_anticorrelated"), "\n")
## Outside anti-correlated: 14
# Save filter list (only outside genes)
saveRDS(outside_inv4m_genes, "inv4m_linked_genes_to_filter.rds")
write.table(
data.frame(gene_id = outside_inv4m_genes),
file = "inv4m_linked_genes_to_filter.txt",
row.names = FALSE,
quote = FALSE
)
# Save complete classification table
write.table(
gene_classification,
file = "inv4m_correlated_genes_classification.txt",
row.names = FALSE,
quote = FALSE,
sep = "\t"
)
saveRDS(gene_classification, "inv4m_correlated_genes_classification.rds")
cat("\nFiles saved:\n")
##
## Files saved:
cat(" Filter list (outside genes only):\n")
## Filter list (outside genes only):
cat(" - inv4m_linked_genes_to_filter.rds\n")
## - inv4m_linked_genes_to_filter.rds
cat(" - inv4m_linked_genes_to_filter.txt\n")
## - inv4m_linked_genes_to_filter.txt
cat(" Classification table (all correlated genes):\n")
## Classification table (all correlated genes):
cat(" - inv4m_correlated_genes_classification.txt\n")
## - inv4m_correlated_genes_classification.txt
cat(" - inv4m_correlated_genes_classification.rds\n")
## - inv4m_correlated_genes_classification.rds
# Example showing how to filter DE results to exclude Inv4m-linked genes
# This removes genes outside Inv4m that show linkage (trans-effects/background)
# Load your DE results (example structure)
# de_results <- read.csv("differential_expression_results.csv")
# Filter out genes outside Inv4m that are correlated with it
de_filtered <- de_results %>%
filter(!gene %in% outside_inv4m_genes)
# Compare before/after
cat("DE results before filtering:", nrow(de_results), "genes\n")
cat("DE results after filtering:", nrow(de_filtered), "genes\n")
cat("Genes removed:", nrow(de_results) - nrow(de_filtered), "\n")
# This filtered set excludes background linkage effects
# Genes inside Inv4m are retained (cis-regulatory effects of interest)
Purpose: Compare genome-wide vs. Inv4m-linked marker distributions.
Approach: Calculate chromosome-specific proportions for all SNPs and significant Inv4m-correlated SNPs.
inv4_distro <- data.frame(
CHR = factor(1:10, levels = 1:10),
all = with(inv4m_cor, table(CHR)) %>% as.numeric(),
inv4m = with(inv4m_cor %>% filter(is_significant), table(CHR)) %>%
as.numeric()
) %>%
mutate(
all_pct = 100 * all / sum(all),
inv4m_pct = 100 * inv4m / sum(inv4m)
)
cat("SNP distribution:\n")
## SNP distribution:
cat(" Chr4 contains", inv4_distro$all[4], "SNPs (",
round(inv4_distro$all_pct[4], 1), "% of total)\n")
## Chr4 contains 8892 SNPs ( 44.8 % of total)
cat(" Chr4 Inv4m-linked:", inv4_distro$inv4m[4], "SNPs (",
round(inv4_distro$inv4m_pct[4], 1), "% of significant)\n")
## Chr4 Inv4m-linked: 7623 SNPs ( 98 % of significant)
cat(" Other chromosomes average:",
round(mean(inv4_distro$all[-4]), 0), "SNPs per chromosome\n")
## Other chromosomes average: 1219 SNPs per chromosome
p_distribution <- inv4_distro %>%
tidyr::pivot_longer(
cols = c("all_pct", "inv4m_pct"),
names_to = "marker_type",
values_to = "pct"
) %>%
ggplot(aes(y = pct, x = CHR, group = marker_type, fill = marker_type)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_manual(
labels = c(
sprintf("All SNPs (n=%s)", format(sum(inv4_distro$all), big.mark = ",")),
sprintf("Inv4m-correlated (n=%s)", format(sum(inv4_distro$inv4m), big.mark = ","))
),
values = c("gold", "purple4")
) +
ylab("%") +
ggpubr::theme_classic2(base_size = 20) +
theme(
legend.position = c(0.7, 0.8),
legend.title = element_blank(),
axis.title.x = element_blank()
)
p_distribution
Purpose: Visualize genome-wide LD patterns.
Approach: Plot -log10(FDR) and correlation coefficients across chromosomes.
# Calculate cumulative positions
manhattan <- inv4m_cor %>%
group_by(CHR) %>%
summarise(chr_len = max(BP)) %>%
mutate(tot = cumsum(chr_len) - chr_len) %>%
select(-chr_len) %>%
left_join(inv4m_cor, ., by = "CHR") %>%
arrange(CHR, BP) %>%
mutate(BPcum = BP + tot)
# Chromosome centers for x-axis
axisdf <- manhattan %>%
group_by(CHR) %>%
summarize(center = (max(BPcum) + min(BPcum)) / 2)
# Handle zero FDR values
manhattan$FDR[manhattan$FDR == 0] <- 1e-8
p_fdr <- manhattan %>%
ggplot(aes(x = BPcum, y = -log10(FDR))) +
geom_point(aes(color = as.factor(CHR)), alpha = 0.8, size = 1.3) +
scale_color_manual(values = rep(c("gold", "purple4"), 22)) +
geom_hline(yintercept = -log10(significance_threshold), col = "red") +
scale_x_continuous(label = axisdf$CHR, breaks = axisdf$center) +
coord_cartesian(ylim = c(0, 10)) +
theme_bw(base_size = 20) +
theme(
axis.title.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
legend.position = "none",
panel.border = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()
)
p_fdr
p_r <- manhattan %>%
ggplot(aes(x = BPcum, y = r)) +
geom_point(aes(color = as.factor(CHR)), size = 1.3) +
geom_point(
data = manhattan %>% filter(FDR >= significance_threshold),
aes(color = as.factor(CHR)),
fill = "white",
shape = 21,
size = 1.3
) +
scale_color_manual(values = rep(c("gold", "purple4"), 22)) +
xlab("Chromosome") +
scale_x_continuous(label = axisdf$CHR, breaks = axisdf$center) +
coord_cartesian(ylim = c(-1.1, 1.1)) +
theme_bw(base_size = 20) +
theme(
legend.position = "none",
panel.border = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()
)
p_r
combined <- ggpubr::ggarrange(
p_distribution,
p_fdr,
p_r,
p_genotype,
nrow = 4,
align = "hv",
heights = c(2, 2, 3, 3)
)
combined
# LD statistics
ld_summary <- inv4m_cor %>%
group_by(CHR) %>%
summarise(
total = n(),
significant = sum(is_significant, na.rm = TRUE),
pct_sig = round(100 * significant / total, 1),
mean_r = round(mean(r, na.rm = TRUE), 3),
mean_R2 = round(mean(R2, na.rm = TRUE), 3)
)
knitr::kable(ld_summary, caption = "LD summary by chromosome")
| CHR | total | significant | pct_sig | mean_r | mean_R2 |
|---|---|---|---|---|---|
| 1 | 2173 | 31 | 1.4 | 0.038 | 0.104 |
| 2 | 1198 | 9 | 0.8 | -0.001 | 0.092 |
| 3 | 1069 | 12 | 1.1 | 0.027 | 0.095 |
| 4 | 8892 | 7623 | 85.7 | 0.801 | 0.829 |
| 5 | 1595 | 37 | 2.3 | 0.051 | 0.112 |
| 6 | 1885 | 5 | 0.3 | 0.028 | 0.085 |
| 7 | 731 | 4 | 0.5 | 0.015 | 0.091 |
| 8 | 854 | 28 | 3.3 | 0.043 | 0.114 |
| 9 | 762 | 8 | 1.0 | 0.041 | 0.093 |
| 10 | 702 | 23 | 3.3 | 0.002 | 0.114 |
# Genotype composition
geno_summary <- by_length %>%
group_by(group, Genotype) %>%
summarise(
mean_length = round(mean(length), 1),
mean_pct = round(mean(pct), 1),
.groups = "drop"
)
knitr::kable(geno_summary, caption = "Mean genotype composition")
| group | Genotype | mean_length | mean_pct |
|---|---|---|---|
| CTRL | B73 | 1849.3 | 86.9 |
| CTRL | Het | 170.0 | 8.0 |
| CTRL | Mi21 | 102.0 | 4.8 |
| CTRL | missing | 6.8 | 0.3 |
| Inv4m | B73 | 1795.9 | 84.4 |
| Inv4m | Het | 194.6 | 9.1 |
| Inv4m | Mi21 | 129.8 | 6.1 |
| Inv4m | missing | 7.8 | 0.4 |
Chromosome 4 enrichment: 98% of Inv4m-correlated SNPs map to Chr4
Perfect LD markers: 3758 SNPs show R² = 1 with tagging marker
Genes outside Inv4m to filter: 72 genes (dispersed + anti-correlated regions)
Introgression extent: Shared segment spans 38.9 Mb vs. 15.2 Mb for Inv4m core
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] cowplot_1.2.0 rtracklayer_1.70.0 GenomicRanges_1.62.0
## [4] Seqinfo_1.0.0 IRanges_2.44.0 S4Vectors_0.48.0
## [7] BiocGenerics_0.56.0 generics_0.1.4 ggtext_0.1.2
## [10] ggplot2_4.0.1 dplyr_1.1.4 vcfR_1.15.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-9 permute_0.9-8
## [3] rlang_1.1.6 magrittr_2.0.4
## [5] matrixStats_1.5.0 compiler_4.5.1
## [7] mgcv_1.9-4 vctrs_0.6.5
## [9] stringr_1.6.0 pkgconfig_2.0.3
## [11] crayon_1.5.3 memuse_4.2-3
## [13] fastmap_1.2.0 backports_1.5.0
## [15] XVector_0.50.0 labeling_0.4.3
## [17] utf8_1.2.6 Rsamtools_2.26.0
## [19] rmarkdown_2.30 markdown_2.0
## [21] ggbeeswarm_0.7.2 purrr_1.2.0
## [23] xfun_0.54 cachem_1.1.0
## [25] cigarillo_1.0.0 litedown_0.8
## [27] jsonlite_2.0.0 DelayedArray_0.36.0
## [29] BiocParallel_1.44.0 broom_1.0.10
## [31] parallel_4.5.1 cluster_2.1.8.1
## [33] R6_2.6.1 bslib_0.9.0
## [35] stringi_1.8.7 RColorBrewer_1.1-3
## [37] car_3.1-3 jquerylib_0.1.4
## [39] Rcpp_1.1.0 SummarizedExperiment_1.40.0
## [41] knitr_1.50 Matrix_1.7-4
## [43] splines_4.5.1 tidyselect_1.2.1
## [45] rstudioapi_0.17.1 abind_1.4-8
## [47] yaml_2.3.10 vegan_2.7-2
## [49] codetools_0.2-20 curl_7.0.0
## [51] lattice_0.22-7 tibble_3.3.0
## [53] Biobase_2.70.0 withr_3.0.2
## [55] S7_0.2.1 evaluate_1.0.5
## [57] pinfsc50_1.3.0 xml2_1.5.0
## [59] Biostrings_2.78.0 pillar_1.11.1
## [61] ggpubr_0.6.2 MatrixGenerics_1.22.0
## [63] carData_3.0-5 RCurl_1.98-1.17
## [65] commonmark_2.0.0 scales_1.4.0
## [67] glue_1.8.0 tools_4.5.1
## [69] BiocIO_1.20.0 ggsignif_0.6.4
## [71] GenomicAlignments_1.46.0 XML_3.99-0.20
## [73] grid_4.5.1 tidyr_1.3.1
## [75] ape_5.8-1 nlme_3.1-168
## [77] beeswarm_0.4.0 vipor_0.4.7
## [79] restfulr_0.0.16 Formula_1.2-5
## [81] cli_3.6.5 S4Arrays_1.10.0
## [83] viridisLite_0.4.2 gtable_0.3.6
## [85] rstatix_0.7.3 sass_0.4.10
## [87] digest_0.6.39 SparseArray_1.10.2
## [89] rjson_0.2.23 farver_2.1.2
## [91] htmltools_0.5.8.1 lifecycle_1.0.4
## [93] httr_1.4.7 gridtext_0.1.5
## [95] MASS_7.3-65