Overview

Purpose: Plot genotype per NIL. Characterize linkage disequilibrium between the Inv4m chromosomal inversion and genome-wide markers in maize near-isogenic lines.

Approach: Calculate Spearman correlations between a tagging SNP within Inv4m (S4_181558683) and all genotyped markers across the genome. Identify regions with significant LD and visualize genotype structure.

Expected outcome: Identify the physical extent of linkage with Inv4m, detect recombination breakpoints, and characterize flanking introgression segments.

Load Libraries

library(vcfR)
library(dplyr)
library(ggplot2)
library(ggtext)
library(rtracklayer)
library(GenomicRanges)
library(cowplot)

Define Genomic Regions

Purpose: Establish coordinates for Inv4m inversion and shared introgression.

Approach: Extract gene boundaries from B73 v5.0 annotation to define inversion breakpoints identified in MCScan analysis.

# Load B73 annotation
myGFF <- "/Users/fvrodriguez/ref/zea/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.59.chr.gff3"
B73 <- rtracklayer::import(myGFF) %>%
  subset(type == "gene" & seqnames %in% 1:10)

genes <- as.data.frame(B73)
genes$ID <- gsub("gene:", "", genes$ID)

# Inv4m boundaries (from MCScan)
inv4m_start <- genes[genes$ID == "Zm00001eb190470", "start"]
inv4m_end <- genes[genes$ID == "Zm00001eb194800", "end"]

# Shared introgression boundaries (empirical)
introgression_start <- 157012149
introgression_end <- 195900523

# Create GRanges objects
inv4m <- GRanges(
  seqnames = "4",
  ranges = IRanges(start = inv4m_start, end = inv4m_end),
  strand = "+"
)

shared_introgression <- GRanges(
  seqnames = "4",
  ranges = IRanges(start = introgression_start, end = introgression_end),
  strand = "+"
)

# Region sizes
inv4m_size <- round((inv4m_end - inv4m_start) / 1e6, 1)
introgression_size <- round((introgression_end - introgression_start) / 1e6, 1)

# Count genes in regions
olap_inv4m <- findOverlaps(inv4m, B73, ignore.strand = TRUE)
inv4m_gene_ids <- genes$ID[subjectHits(olap_inv4m)]

olap_introgression <- findOverlaps(shared_introgression, B73, ignore.strand = TRUE)
shared_introgression_gene_ids <- genes$ID[subjectHits(olap_introgression)]

flanking_introgression_gene_ids <- shared_introgression_gene_ids[
  !(shared_introgression_gene_ids %in% inv4m_gene_ids)
]

cat("Inv4m inversion:", inv4m_size, "Mb |", length(inv4m_gene_ids), "genes\n")

## Inv4m inversion: 15.2 Mb | 434 genes

cat("Shared introgression:", introgression_size, "Mb |", 
    length(shared_introgression_gene_ids), "genes\n")

## Shared introgression: 38.9 Mb | 1101 genes

cat("Flanking regions:", 
    round(introgression_size - inv4m_size, 1), "Mb |", 
    length(flanking_introgression_gene_ids), "genes\n")

## Flanking regions: 23.7 Mb | 667 genes

Import and Process VCF

Purpose: Load genotype data and convert to numeric matrix.

Approach: Read VCF, extract GT field, recode genotypes as 0/1/2 for B73/Het/Mi21.

# Read VCF
vcf <- read.vcfR("~/Desktop/inv4m_PSU_imputed.vcf")

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 10
##   header_line: 11
##   variant count: 19861
##   column count: 22
## Meta line 10 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 19861
##   Character matrix gt cols: 22
##   skip: 0
##   nrows: 19861
##   row_num: 0
## Processed variant 1000Processed variant 2000Processed variant 3000Processed variant 4000Processed variant 5000Processed variant 6000Processed variant 7000Processed variant 8000Processed variant 9000Processed variant 10000Processed variant 11000Processed variant 12000Processed variant 13000Processed variant 14000Processed variant 15000Processed variant 16000Processed variant 17000Processed variant 18000Processed variant 19000Processed variant: 19861
## All variants processed

gt <- extract.gt(vcf, element = "GT")

# Recode genotypes
gt[gt == "0/0"] <- 0
gt[gt == "0/1" | gt == "1/0"] <- 1
gt[gt == "1/1"] <- 2

# Convert to numeric matrix
ngt <- matrix(as.integer(gt), ncol = ncol(gt))
colnames(ngt) <- colnames(gt)
rownames(ngt) <- rownames(gt)

# Sample ordering (CTRL then INV4 carriers, alphabetically within group)
sample_order <- c(
  sort(c("3095", "3113", "4113", "3083", "4107", "4122")),
  sort(c("3092", "3080", "3059", "3131", "4125", "4080", "4131"))
)

ngt <- ngt[, sample_order] %>% t()

# Genotype distribution (by site)
total_sites <- ncol(ngt) * nrow(ngt)
geno_freq <- table(ngt) / total_sites
cat("Genotype frequencies (by site):\n")

## Genotype frequencies (by site):

cat("  B73/B73 (0):", round(geno_freq["0"], 3), "\n")

##   B73/B73 (0): 0.636

cat("  B73/Mi21 (1):", round(geno_freq["1"], 3), "\n")

##   B73/Mi21 (1): 0.09

cat("  Mi21/Mi21 (2):", round(geno_freq["2"], 3), "\n")

##   Mi21/Mi21 (2): 0.273

cat("  Missing:", round(sum(is.na(ngt)) / total_sites, 4), "\n")

##   Missing: 0.0016

cat("Dimensions:", nrow(ngt), "samples ×", ncol(ngt), "markers\n")

## Dimensions: 13 samples × 19861 markers

Genotype Run Analysis

Purpose: Quantify introgression content across samples.

Approach: Calculate run-length encoding of genotypes, sum by allelic state.

get_genotype_runs <- function(x, sample_order) {
  result <- data.frame()
  chr_levels <- paste0("S", 1:10)
  
  for (ind_idx in sample_order) {
    result_ind <- data.frame()
    
    for (chr_idx in 1:10) {
      chr_prefix <- paste0("S", chr_idx, "_")
      chr_name <- paste0("S", chr_idx)
      chr <- x[grepl(chr_prefix, rownames(x), perl = TRUE), ]
      runs <- rle(chr[, ind_idx])
      
      chr_long <- data.frame(
        ind = ind_idx,
        chr = chr_name,
        pos = as.integer(gsub(chr_prefix, "", names(runs$values))),
        gt = runs$values
      ) %>%
        group_by(ind, chr) %>%
        mutate(
          lag = lag(pos, default = 0),
          span = (pos - lag) / 1e6,
          chr = factor(chr, levels = chr_levels),
          ind = factor(ind, levels = sample_order),
          missing = is.na(gt),
          missing_length = missing * span,
          B73 = as.numeric(gt == 0),
          B73_length = B73 * span,
          Het = as.numeric(gt == 1),
          Het_length = Het * span,
          Mi21 = as.numeric(gt == 2),
          Mi21_length = Mi21 * span
        )
      
      result_ind <- rbind(result_ind, chr_long)
    }
    
    result <- rbind(result, result_ind)
  }
  result
}

# Calculate runs
m <- t(ngt)
gt_runs <- get_genotype_runs(m, sample_order)

# Add grouping
gt_runs$group <- case_when(
  gt_runs$ind %in% c("3092", "3080", "3059", "3131", "4125", "4080", "4131") ~ 
    "Inv4m",
  .default = "CTRL"
)

# Genome size
chr_length <- gt_runs %>%
  group_by(chr) %>%
  summarise(chr_length = max(pos) / 1e6)
genome_size <- sum(chr_length$chr_length)

# Summarize by genotype
by_length <- gt_runs %>%
  inner_join(chr_length, by = "chr") %>%
  ungroup() %>%
  group_by(ind) %>%
  summarise(
    B73 = sum(B73_length, na.rm = TRUE),
    Het = sum(Het_length, na.rm = TRUE),
    Mi21 = sum(Mi21_length, na.rm = TRUE),
    missing = sum(missing_length, na.rm = TRUE),
    group = group[1]
  ) %>%
  mutate(group = factor(group, levels = c("CTRL", "Inv4m"))) %>%
  tidyr::pivot_longer(
    cols = c("B73", "Het", "Mi21", "missing"),
    names_to = "Genotype",
    values_to = "length"
  ) %>%
  mutate(pct = 100 * length / genome_size)

cat("Genome size:", round(genome_size, 1), "Mb\n")

## Genome size: 2128.1 Mb

cat("\nGenotype run length summary (Mb):\n")

## 
## Genotype run length summary (Mb):

by_length %>%
  group_by(Genotype) %>%
  summarise(
    mean_Mb = round(mean(length), 1),
    mean_pct = round(mean(pct), 2)
  ) %>%
  print()

## # A tibble: 4 × 3
##   Genotype mean_Mb mean_pct
##   <chr>      <dbl>    <dbl>
## 1 B73       1821.     85.6 
## 2 Het        183.      8.61
## 3 Mi21       117.      5.5 
## 4 missing      7.3     0.34

cat("\nBy introgression group:\n")

## 
## By introgression group:

by_length %>%
  group_by(group, Genotype) %>%
  summarise(
    mean_Mb = round(mean(length), 1),
    mean_pct = round(mean(pct), 2),
    .groups = "drop"
  ) %>%
  print()

## # A tibble: 8 × 4
##   group Genotype mean_Mb mean_pct
##   <fct> <chr>      <dbl>    <dbl>
## 1 CTRL  B73       1849.     86.9 
## 2 CTRL  Het        170       7.99
## 3 CTRL  Mi21       102       4.79
## 4 CTRL  missing      6.8     0.32
## 5 Inv4m B73       1796.     84.4 
## 6 Inv4m Het        195.      9.15
## 7 Inv4m Mi21       130.      6.1 
## 8 Inv4m missing      7.8     0.37

Genotype Visualization

Purpose: Create chromosome-wide haplotype plots.

Approach: Plot genotype runs as colored bars across chromosomes.

# Genotype color breaks
breaks <- c(0, 1.5, 2.5)
names(breaks) <- c("B73/B73", "B73/Mi21", "Mi21/Mi21")

# Selection marker triangle
triangle <- data.frame(
  pos = 0,
  span = 181.6,
  ind = sample_order[length(sample_order)],
  chr = "4",
  label = factor("Selection marker<br>group")
)

# Coordinate limits
inversion_limit <- round(c(inv4m_start, inv4m_end) / 1e6, 2)
introgression_limit <- round(c(introgression_start, introgression_end) / 1e6, 2)

gt_plot <- gt_runs %>%
  ungroup() %>%
  mutate(chr = gsub("S", "", chr)) %>%
  ggplot(aes(y = ind, x = span, fill = gt)) +
  geom_col(width = 1) +
  scale_fill_viridis_b(
    breaks = breaks,
    limits = c(-0.5, 2.5),
    show.limits = TRUE,
    na.value = "gray85",
    direction = -1
  ) +
  geom_point(
    data = triangle,
    mapping = aes(shape = label),
    position = position_nudge(y = 2),
    size = 1,
    color = "#43137D",
    fill = "#43137D"
  ) +
  scale_shape_manual(values = 25) +
  facet_wrap(factor(chr, 1:10) ~ ., strip.position = "left", ncol = 1) +
  coord_cartesian(xlim = c(0, 450), expand = FALSE) +
  xlab("Position [Mb]") +
  ylab("Chromosome") +
  labs(fill = "Genotype") +
  guides(
    fill = guide_colorsteps(
      order = 1,
      label.position = "left",
      label.hjust = 1,
      theme = theme(
        legend.title = element_text(size = 8, hjust = 0),
        legend.text = element_text(size = 8, vjust = 1.5),
        legend.key.width = unit(1, "lines"),
        legend.key.height = unit(2.7, "lines")
      )
    ),
    shape = guide_legend(
      order = 2,
      override.aes = list(size = 4),
      label.position = "left",
      label.hjust = 1,
      theme = theme(
        legend.title = element_blank(),
        legend.text = element_markdown(size = 8),
        legend.key.width = unit(1, "lines")
      )
    )
  ) +
  ggpubr::theme_classic2(base_size = 4) +
  theme(
    legend.position = c(0.9, 0.85),
    legend.spacing.y = unit(0.1, "lines"),
    legend.box.just = "right",
    strip.background = element_blank(),
    strip.text.y.left = element_text(size = 12, angle = 0),
    axis.text = element_text(size = 12),
    axis.title.y = element_text(size = 14),
    axis.title.x = element_text(size = 14),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank()
  )

gt_plot

# Chr4 detailed inset
inlay <- gt_runs %>%
  filter(chr == "S4") %>%
  ungroup() %>%
  mutate(chr = gsub("S", "", chr)) %>%
  ggplot(aes(x = span, y = ind, fill = gt)) +
  geom_col(width = 1) +
  scale_fill_viridis_b(
    breaks = breaks,
    limits = c(-0.5, 2.5),
    show.limits = TRUE,
    na.value = "gray85",
    direction = -1
  ) +
  annotate("point",
    x = 181.6, y = 14.4,
    shape = 25, size = 4, color = "#43137D", fill = "#43137D"
  ) +
  annotate("text",
    x = c(inversion_limit, introgression_limit), y = 14.4,
    label = rep("|", 4), size = 4, color = "#43137D"
  ) +
  coord_cartesian(xlim = c(150, 200)) +
  xlab("Chromosome 4") +
  ggpubr::theme_classic2(base_size = 8) +
  theme(
    legend.position = "none",
    strip.background = element_blank(),
    strip.text = element_blank(),
    plot.title = element_blank(),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank(),
    plot.background = element_rect(fill = "transparent", color = NA)
  )

# Combine plots
plot_with_inset <- ggdraw() +
  draw_plot(gt_plot) +
  draw_plot(inlay,
    x = 0.45, y = 0.03,
    width = 0.6, height = 0.6,
    scale = 0.8
  )

plot_with_inset

# Background means
bg_mean <- by_length %>%
  ungroup() %>%
  group_by(Genotype) %>%
  summarise_at(.vars = c("length", "pct"), .funs = mean)

p_genotype <- by_length %>%
  ggplot(aes(x = group, y = pct)) +
  geom_hline(data = bg_mean, aes(yintercept = pct), linetype = 2) +
  geom_boxplot(outlier.colour = "white", width = 0.25) +
  ggbeeswarm::geom_quasirandom(size = 2) +
  facet_wrap(. ~ Genotype, scales = "free_y", ncol = 4) +
  xlab("Introgression group") +
  ylab("% of genome length") +
  ggpubr::theme_classic2(base_size = 20) +
  theme(strip.background = element_blank())

p_genotype

Calculate Linkage Disequilibrium

Purpose: Quantify correlation between tagging SNP and all genome-wide markers.

Approach: Spearman correlation with S4_181558683 (within Inv4m). FDR correction for multiple testing.

# Calculate correlations
tagging_snp <- "S4_181558683"
cor_matrix <- cor(
  ngt[, tagging_snp, drop = FALSE],
  ngt,
  use = "pairwise.complete.obs"
) %>% t()

# Statistical testing
cor_pvalues <- apply(ngt, 2, function(x) {
  cor.test(x, y = ngt[, tagging_snp], method = "spearman")$p.value
})

# Combine results
significance_threshold <- 0.01
inv4m_cor <- data.frame(
  marker = rownames(cor_matrix),
  r = cor_matrix[, 1]
) %>%
  tidyr::separate(marker, into = c("CHR", "BP"), sep = "_") %>%
  mutate(
    BP = as.numeric(BP),
    CHR = gsub("S", "", CHR) %>% factor(levels = as.character(1:10)),
    p = cor_pvalues,
    FDR = p.adjust(p, method = "fdr"),
    R2 = r^2,
    is_significant = FDR < significance_threshold
  ) %>%
  tibble::rownames_to_column("ID")

cat("Total markers:", nrow(inv4m_cor), "\n")

## Total markers: 19861

cat("Significant (FDR <", significance_threshold, "):", 
    sum(inv4m_cor$is_significant, na.rm = TRUE), 
    sprintf("(%.1f%%)", 100 * mean(inv4m_cor$is_significant, na.rm = TRUE)), "\n")

## Significant (FDR < 0.01 ): 7780 (45.4%)

cat("Perfect LD (R² = 1):", sum(inv4m_cor$R2 == 1, na.rm = TRUE), "\n")

## Perfect LD (R² = 1): 3758

hist(inv4m_cor$R2, 
     main = "R² distribution with Inv4m tagging marker\n19,861 markers total",
     xlab = "R²", 
     ylim = c(0, 8000),
     col = "skyblue",
     breaks = 30)

Identify Correlated Regions

Purpose: Map SNPs with significant LD to genomic features.

Approach: Create GRanges from significant SNPs, classify by position relative to introgression boundaries.

# All significant SNPs
correlated_SNP <- with(inv4m_cor %>% filter(is_significant), {
  GRanges(
    seqnames = CHR,
    ranges = IRanges(start = BP, end = BP),
    ID = ID
  )
})

# Outside shared introgression
outside_correlated <- inv4m_cor %>%
  filter(is_significant) %>%
  filter(!(CHR == 4 & BP >= introgression_start & BP <= introgression_end))

# Anti-correlated region (upstream of introgression)
anticorrelated_start <- 154283637
anticorrelated_end <- 156985577

anticorrelated <- correlated_SNP %>%
  subset(seqnames == 4 & start >= anticorrelated_start & end <= anticorrelated_end)

# Inside Inv4m
inside_correlated <- correlated_SNP %>%
  subset(seqnames == 4 & start >= introgression_start & end <= inv4m_end)

cat("Correlated SNPs by location:\n")

## Correlated SNPs by location:

cat("  Total significant:", length(correlated_SNP), "\n")

##   Total significant: 7780

cat("  Inside shared introgression (35 Mb):", length(inside_correlated), 
    sprintf("(%.1f%%)", 100 * length(inside_correlated) / length(correlated_SNP)), 
    "\n")

##   Inside shared introgression (35 Mb): 5984 (76.9%)

cat("  Outside shared introgression:", nrow(outside_correlated), 
    sprintf("(%.1f%%)", 100 * nrow(outside_correlated) / length(correlated_SNP)), 
    "\n")

##   Outside shared introgression: 382 (4.9%)

cat("  Anti-correlated upstream (2.7 Mb):", length(anticorrelated), "\n")

##   Anti-correlated upstream (2.7 Mb): 207

# Chromosome distribution of significant markers
cat("\nSignificant markers by chromosome:\n")

## 
## Significant markers by chromosome:

sig_by_chr <- with(inv4m_cor %>% filter(is_significant), table(CHR))
print(sig_by_chr)

## CHR
##    1    2    3    4    5    6    7    8    9   10 
##   31    9   12 7623   37    5    4   28    8   23

Gene Overlap Analysis

Purpose: Identify genes overlapping Inv4m-correlated regions to create filter list for downstream analyses.

Approach: Use GenomicRanges::findOverlaps to map significant SNPs to gene annotations. Classify genes by location: inside shared introgression, dispersed background, or anti-correlated upstream.

Expected outcome: Generate comprehensive list of genes to exclude from analyses aimed at identifying trans-regulatory effects of Inv4m.

# Load transcript annotations
transcripts_gff <- rtracklayer::import(myGFF) %>%
  subset(type == "mRNA" & seqnames %in% 1:10)

transcripts <- as.data.frame(transcripts_gff)
transcripts$gene_id <- gsub("transcript:", "", transcripts$ID)
transcripts$gene_id <- gsub("_.*", "", transcripts$gene_id, perl = TRUE)

# Convert SNP sets to GRanges for overlap
outside_correlated_gr <- with(outside_correlated, {
  GRanges(
    seqnames = CHR,
    ranges = IRanges(start = BP, end = BP),
    ID = ID
  )
})

# 1. Outside shared introgression (dispersed background)
olap_outside <- findOverlaps(outside_correlated_gr, transcripts_gff, 
                              ignore.strand = TRUE)
outside_correlated_genes <- transcripts$gene_id[subjectHits(olap_outside)] %>%
  sort() %>%
  unique()

cat("Outside shared introgression:\n")

## Outside shared introgression:

cat("  SNPs:", length(outside_correlated_gr), "\n")

##   SNPs: 382

cat("  Unique SNP positions:", length(unique(queryHits(olap_outside))), "\n")

##   Unique SNP positions: 286

cat("  Genes overlapped:", length(outside_correlated_genes), "\n")

##   Genes overlapped: 58

# 2. Anti-correlated upstream region (154.3-157.0 Mb)
olap_anti <- findOverlaps(anticorrelated, transcripts_gff, 
                           ignore.strand = TRUE)
anticorrelated_genes <- transcripts$gene_id[subjectHits(olap_anti)] %>%
  sort() %>%
  unique()

cat("\nAnti-correlated upstream (154.3-157.0 Mb):\n")

## 
## Anti-correlated upstream (154.3-157.0 Mb):

cat("  SNPs:", length(anticorrelated), "\n")

##   SNPs: 207

cat("  Unique SNP positions:", length(unique(queryHits(olap_anti))), "\n")

##   Unique SNP positions: 177

cat("  Genes overlapped:", length(anticorrelated_genes), "\n")

##   Genes overlapped: 14

# 3. Inside shared introgression (157.0-195.9 Mb, 35 Mb)
olap_inside <- findOverlaps(inside_correlated, transcripts_gff, 
                             ignore.strand = TRUE)
inside_correlated_genes <- transcripts$gene_id[subjectHits(olap_inside)] %>%
  sort() %>%
  unique()

cat("\nInside shared introgression (157.0-195.9 Mb):\n")

## 
## Inside shared introgression (157.0-195.9 Mb):

cat("  SNPs:", length(inside_correlated), "\n")

##   SNPs: 5984

cat("  Unique SNP positions:", length(unique(queryHits(olap_inside))), "\n")

##   Unique SNP positions: 5666

cat("  Genes overlapped:", length(inside_correlated_genes), "\n")

##   Genes overlapped: 402

# 4. Create filter list and classification table
# Only filter genes OUTSIDE the introgression that are correlated
# Note: anticorrelated genes are a SUBSET of outside_correlated genes
# (they're outside the introgression, specifically in the upstream region)

# Filter list is simply all outside genes (includes anticorrelated as subset)
outside_inv4m_genes <- outside_correlated_genes

cat("\nGenes to filter out (outside Inv4m but correlated):", 
    length(outside_inv4m_genes), "\n")

## 
## Genes to filter out (outside Inv4m but correlated): 58

cat("  Total outside (includes anti-correlated):", length(outside_correlated_genes), "\n")

##   Total outside (includes anti-correlated): 58

cat("  Anti-correlated subset:", length(anticorrelated_genes), 
    sprintf("(%.1f%% of outside)", 100 * length(anticorrelated_genes) / length(outside_correlated_genes)), 
    "\n")

##   Anti-correlated subset: 14 (24.1% of outside)

# Create classification table for all correlated genes
# Anti-correlated genes get their own label even though they're subset of outside
gene_classification <- data.frame(
  gene_id = c(inside_correlated_genes, 
              outside_correlated_genes),
  classification = c(
    rep("inside_introgression", length(inside_correlated_genes)),
    rep("outside_dispersed", length(outside_correlated_genes))
  )
) %>%
  distinct() %>%
  mutate(
    classification = ifelse(
      gene_id %in% anticorrelated_genes,
      "outside_anticorrelated",
      classification
    )
  )

cat("\nTotal genes correlated with Inv4m:", nrow(gene_classification), "\n")

## 
## Total genes correlated with Inv4m: 460

cat("  Inside introgression:", sum(gene_classification$classification == "inside_introgression"), "\n")

##   Inside introgression: 402

cat("  Outside dispersed:", sum(gene_classification$classification == "outside_dispersed"), "\n")

##   Outside dispersed: 44

cat("  Outside anti-correlated:", sum(gene_classification$classification == "outside_anticorrelated"), "\n")

##   Outside anti-correlated: 14

# Save filter list (only outside genes)
saveRDS(outside_inv4m_genes, "inv4m_linked_genes_to_filter.rds")
write.table(
  data.frame(gene_id = outside_inv4m_genes),
  file = "inv4m_linked_genes_to_filter.txt",
  row.names = FALSE,
  quote = FALSE
)

# Save complete classification table
write.table(
  gene_classification,
  file = "inv4m_correlated_genes_classification.txt",
  row.names = FALSE,
  quote = FALSE,
  sep = "\t"
)
saveRDS(gene_classification, "inv4m_correlated_genes_classification.rds")

cat("\nFiles saved:\n")

## 
## Files saved:

cat("  Filter list (outside genes only):\n")

##   Filter list (outside genes only):

cat("    - inv4m_linked_genes_to_filter.rds\n")

##     - inv4m_linked_genes_to_filter.rds

cat("    - inv4m_linked_genes_to_filter.txt\n")

##     - inv4m_linked_genes_to_filter.txt

cat("  Classification table (all correlated genes):\n")

##   Classification table (all correlated genes):

cat("    - inv4m_correlated_genes_classification.txt\n")

##     - inv4m_correlated_genes_classification.txt

cat("    - inv4m_correlated_genes_classification.rds\n")

##     - inv4m_correlated_genes_classification.rds

Example: Applying Filter to DE Results

# Example showing how to filter DE results to exclude Inv4m-linked genes
# This removes genes outside Inv4m that show linkage (trans-effects/background)

# Load your DE results (example structure)
# de_results <- read.csv("differential_expression_results.csv")

# Filter out genes outside Inv4m that are correlated with it
de_filtered <- de_results %>%
  filter(!gene %in% outside_inv4m_genes)

# Compare before/after
cat("DE results before filtering:", nrow(de_results), "genes\n")
cat("DE results after filtering:", nrow(de_filtered), "genes\n")
cat("Genes removed:", nrow(de_results) - nrow(de_filtered), "\n")

# This filtered set excludes background linkage effects
# Genes inside Inv4m are retained (cis-regulatory effects of interest)

SNP Distribution Analysis

Purpose: Compare genome-wide vs. Inv4m-linked marker distributions.

Approach: Calculate chromosome-specific proportions for all SNPs and significant Inv4m-correlated SNPs.

inv4_distro <- data.frame(
  CHR = factor(1:10, levels = 1:10),
  all = with(inv4m_cor, table(CHR)) %>% as.numeric(),
  inv4m = with(inv4m_cor %>% filter(is_significant), table(CHR)) %>% 
    as.numeric()
) %>%
  mutate(
    all_pct = 100 * all / sum(all),
    inv4m_pct = 100 * inv4m / sum(inv4m)
  )

cat("SNP distribution:\n")

## SNP distribution:

cat("  Chr4 contains", inv4_distro$all[4], "SNPs (", 
    round(inv4_distro$all_pct[4], 1), "% of total)\n")

##   Chr4 contains 8892 SNPs ( 44.8 % of total)

cat("  Chr4 Inv4m-linked:", inv4_distro$inv4m[4], "SNPs (",
    round(inv4_distro$inv4m_pct[4], 1), "% of significant)\n")

##   Chr4 Inv4m-linked: 7623 SNPs ( 98 % of significant)

cat("  Other chromosomes average:", 
    round(mean(inv4_distro$all[-4]), 0), "SNPs per chromosome\n")

##   Other chromosomes average: 1219 SNPs per chromosome

p_distribution <- inv4_distro %>%
  tidyr::pivot_longer(
    cols = c("all_pct", "inv4m_pct"),
    names_to = "marker_type",
    values_to = "pct"
  ) %>%
  ggplot(aes(y = pct, x = CHR, group = marker_type, fill = marker_type)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(
    labels = c(
      sprintf("All SNPs (n=%s)", format(sum(inv4_distro$all), big.mark = ",")),
      sprintf("Inv4m-correlated (n=%s)", format(sum(inv4_distro$inv4m), big.mark = ","))
    ),
    values = c("gold", "purple4")
  ) +
  ylab("%") +
  ggpubr::theme_classic2(base_size = 20) +
  theme(
    legend.position = c(0.7, 0.8),
    legend.title = element_blank(),
    axis.title.x = element_blank()
  )

p_distribution

Manhattan Plots

Purpose: Visualize genome-wide LD patterns.

Approach: Plot -log10(FDR) and correlation coefficients across chromosomes.

# Calculate cumulative positions
manhattan <- inv4m_cor %>%
  group_by(CHR) %>%
  summarise(chr_len = max(BP)) %>%
  mutate(tot = cumsum(chr_len) - chr_len) %>%
  select(-chr_len) %>%
  left_join(inv4m_cor, ., by = "CHR") %>%
  arrange(CHR, BP) %>%
  mutate(BPcum = BP + tot)

# Chromosome centers for x-axis
axisdf <- manhattan %>%
  group_by(CHR) %>%
  summarize(center = (max(BPcum) + min(BPcum)) / 2)

# Handle zero FDR values
manhattan$FDR[manhattan$FDR == 0] <- 1e-8

p_fdr <- manhattan %>%
  ggplot(aes(x = BPcum, y = -log10(FDR))) +
  geom_point(aes(color = as.factor(CHR)), alpha = 0.8, size = 1.3) +
  scale_color_manual(values = rep(c("gold", "purple4"), 22)) +
  geom_hline(yintercept = -log10(significance_threshold), col = "red") +
  scale_x_continuous(label = axisdf$CHR, breaks = axisdf$center) +
  coord_cartesian(ylim = c(0, 10)) +
  theme_bw(base_size = 20) +
  theme(
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    legend.position = "none",
    panel.border = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

p_fdr

p_r <- manhattan %>%
  ggplot(aes(x = BPcum, y = r)) +
  geom_point(aes(color = as.factor(CHR)), size = 1.3) +
  geom_point(
    data = manhattan %>% filter(FDR >= significance_threshold),
    aes(color = as.factor(CHR)),
    fill = "white",
    shape = 21,
    size = 1.3
  ) +
  scale_color_manual(values = rep(c("gold", "purple4"), 22)) +
  xlab("Chromosome") +
  scale_x_continuous(label = axisdf$CHR, breaks = axisdf$center) +
  coord_cartesian(ylim = c(-1.1, 1.1)) +
  theme_bw(base_size = 20) +
  theme(
    legend.position = "none",
    panel.border = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

p_r

Combined Figure

combined <- ggpubr::ggarrange(
  p_distribution,
  p_fdr,
  p_r,
  p_genotype,
  nrow = 4,
  align = "hv",
  heights = c(2, 2, 3, 3)
)

combined

Summary

# LD statistics
ld_summary <- inv4m_cor %>%
  group_by(CHR) %>%
  summarise(
    total = n(),
    significant = sum(is_significant, na.rm = TRUE),
    pct_sig = round(100 * significant / total, 1),
    mean_r = round(mean(r, na.rm = TRUE), 3),
    mean_R2 = round(mean(R2, na.rm = TRUE), 3)
  )

knitr::kable(ld_summary, caption = "LD summary by chromosome")

LD summary by chromosome
CHR	total	significant	pct_sig	mean_r	mean_R2
1	2173	31	1.4	0.038	0.104
2	1198	9	0.8	-0.001	0.092
3	1069	12	1.1	0.027	0.095
4	8892	7623	85.7	0.801	0.829
5	1595	37	2.3	0.051	0.112
6	1885	5	0.3	0.028	0.085
7	731	4	0.5	0.015	0.091
8	854	28	3.3	0.043	0.114
9	762	8	1.0	0.041	0.093
10	702	23	3.3	0.002	0.114

# Genotype composition
geno_summary <- by_length %>%
  group_by(group, Genotype) %>%
  summarise(
    mean_length = round(mean(length), 1),
    mean_pct = round(mean(pct), 1),
    .groups = "drop"
  )

knitr::kable(geno_summary, caption = "Mean genotype composition")

Mean genotype composition
group	Genotype	mean_length	mean_pct
CTRL	B73	1849.3	86.9
CTRL	Het	170.0	8.0
CTRL	Mi21	102.0	4.8
CTRL	missing	6.8	0.3
Inv4m	B73	1795.9	84.4
Inv4m	Het	194.6	9.1
Inv4m	Mi21	129.8	6.1
Inv4m	missing	7.8	0.4

Key Findings

Chromosome 4 enrichment: 98% of Inv4m-correlated SNPs map to Chr4
Perfect LD markers: 3758 SNPs show R² = 1 with tagging marker
Genes outside Inv4m to filter: 72 genes (dispersed + anti-correlated regions)
Introgression extent: Shared segment spans 38.9 Mb vs. 15.2 Mb for Inv4m core

Session Info

sessionInfo()

## R version 4.5.1 (2025-06-13)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] cowplot_1.2.0        rtracklayer_1.70.0   GenomicRanges_1.62.0
##  [4] Seqinfo_1.0.0        IRanges_2.44.0       S4Vectors_0.48.0    
##  [7] BiocGenerics_0.56.0  generics_0.1.4       ggtext_0.1.2        
## [10] ggplot2_4.0.1        dplyr_1.1.4          vcfR_1.15.0         
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-9                permute_0.9-8              
##  [3] rlang_1.1.6                 magrittr_2.0.4             
##  [5] matrixStats_1.5.0           compiler_4.5.1             
##  [7] mgcv_1.9-4                  vctrs_0.6.5                
##  [9] stringr_1.6.0               pkgconfig_2.0.3            
## [11] crayon_1.5.3                memuse_4.2-3               
## [13] fastmap_1.2.0               backports_1.5.0            
## [15] XVector_0.50.0              labeling_0.4.3             
## [17] utf8_1.2.6                  Rsamtools_2.26.0           
## [19] rmarkdown_2.30              markdown_2.0               
## [21] ggbeeswarm_0.7.2            purrr_1.2.0                
## [23] xfun_0.54                   cachem_1.1.0               
## [25] cigarillo_1.0.0             litedown_0.8               
## [27] jsonlite_2.0.0              DelayedArray_0.36.0        
## [29] BiocParallel_1.44.0         broom_1.0.10               
## [31] parallel_4.5.1              cluster_2.1.8.1            
## [33] R6_2.6.1                    bslib_0.9.0                
## [35] stringi_1.8.7               RColorBrewer_1.1-3         
## [37] car_3.1-3                   jquerylib_0.1.4            
## [39] Rcpp_1.1.0                  SummarizedExperiment_1.40.0
## [41] knitr_1.50                  Matrix_1.7-4               
## [43] splines_4.5.1               tidyselect_1.2.1           
## [45] rstudioapi_0.17.1           abind_1.4-8                
## [47] yaml_2.3.10                 vegan_2.7-2                
## [49] codetools_0.2-20            curl_7.0.0                 
## [51] lattice_0.22-7              tibble_3.3.0               
## [53] Biobase_2.70.0              withr_3.0.2                
## [55] S7_0.2.1                    evaluate_1.0.5             
## [57] pinfsc50_1.3.0              xml2_1.5.0                 
## [59] Biostrings_2.78.0           pillar_1.11.1              
## [61] ggpubr_0.6.2                MatrixGenerics_1.22.0      
## [63] carData_3.0-5               RCurl_1.98-1.17            
## [65] commonmark_2.0.0            scales_1.4.0               
## [67] glue_1.8.0                  tools_4.5.1                
## [69] BiocIO_1.20.0               ggsignif_0.6.4             
## [71] GenomicAlignments_1.46.0    XML_3.99-0.20              
## [73] grid_4.5.1                  tidyr_1.3.1                
## [75] ape_5.8-1                   nlme_3.1-168               
## [77] beeswarm_0.4.0              vipor_0.4.7                
## [79] restfulr_0.0.16             Formula_1.2-5              
## [81] cli_3.6.5                   S4Arrays_1.10.0            
## [83] viridisLite_0.4.2           gtable_0.3.6               
## [85] rstatix_0.7.3               sass_0.4.10                
## [87] digest_0.6.39               SparseArray_1.10.2         
## [89] rjson_0.2.23                farver_2.1.2               
## [91] htmltools_0.5.8.1           lifecycle_1.0.4            
## [93] httr_1.4.7                  gridtext_0.1.5             
## [95] MASS_7.3-65

NIL Genotype Plotting and Genotype Correlation Analysis of Inv4m Introgression

Fausto Rodriguez Zapata

2025-12-05