Overview

This document outlines a genomic ancestry analysis pipeline for maize introgression lines. The analysis identifies genomic ancestry across bins of size 1Mb using the following approach:

  1. Initial Classification: Bins with no alternative alleles (ALT_FREQ = 0) are automatically classified as “REF”
  2. Clustering: For the remaining bins (ALT_FREQ > 0), multiple clustering methods are applied with k=3
  3. Method Comparison: Four different clustering approaches are compared:
    • K-means with weighted ALT_FREQ
    • K-means with arcsin transformation
    • K-means with log10 transformation
    • Gaussian mixture model
  4. HMM Smoothing: Hidden Markov Model smoothing is applied to reduce noise and enforce biological constraints
  5. Validation: Methods are compared for consistency and biological plausibility

Load Required Packages

# Load required libraries
library(dplyr)        # Data manipulation
library(ggplot2)      # Plotting
library(Ckmeans.1d.dp) # K-means clustering
library(rebmix)       # Gaussian mixture model
library(HMM)          # Hidden Markov Models
library(tidyr)        # Data reshaping
library(scales)       # Unit formatting for plots
library(ggpubr)       # Plot arrangement
library(tibble)       # Modern data frames

Data Preparation and Initial Exploration

# Read the binned allelic counts
read_freq <- read.table("~/Desktop/PN17_SID1632_bin_genotypes.tsv", header=TRUE)

# Display first few rows of the binned data
head(read_freq)
##   CONTIG BIN_POS VARIANT_COUNT INFORMATIVE_VARIANT_COUNT DEPTH_SUM ALT_COUNT
## 1   chr1       1          8163                      3810      7355        34
## 2   chr1       2         13158                      5512     10174        20
## 3   chr1       3         14272                      6128     11390        25
## 4   chr1       4         11263                      5084      9273        16
## 5   chr1       5         12470                      5440     10589        38
## 6   chr1       6         10971                      4497      8514        21
##      ALT_FREQ BIN_START BIN_END ALT_FREQ_NORM GENOTYPE
## 1 0.004622706      5892  999990   0.002914504      REF
## 2 0.001965795   1000069 1999892   0.001997776      REF
## 3 0.002194908   2000252 2999995   0.002419468      REF
## 4 0.001725439   3000032 3989703   0.001500972      REF
## 5 0.003588630   4002810 4999974   0.003456319      REF
## 6 0.002466526   5000001 5997031   0.002090021      REF
# Calculate summary statistics
IVC_mean <- mean(read_freq$INFORMATIVE_VARIANT_COUNT)
IVC_sd <- sd(read_freq$INFORMATIVE_VARIANT_COUNT)

# Display summary of bins
cat("Total number of bins:", nrow(read_freq), "\n")
## Total number of bins: 2132
cat("Average depth sum:", mean(read_freq$DEPTH_SUM), "\n")
## Average depth sum: 9686.165
cat("Average number of informative variants per bin:", IVC_mean, "\n")
## Average number of informative variants per bin: 5223.613
cat("Median number of informative variants per bin:", median(read_freq$INFORMATIVE_VARIANT_COUNT), "\n")
## Median number of informative variants per bin: 5093.5
cat("Range of informative variants per bin:", min(read_freq$INFORMATIVE_VARIANT_COUNT), "to", max(read_freq$INFORMATIVE_VARIANT_COUNT), "\n")
## Range of informative variants per bin: 13 to 10052

Weighting ALT_FREQ by informative variant count

# Create weightesALT_FREQ 
read_freq <- read_freq %>%
  mutate(
    ALT_FREQ_WEIGHTED = ALT_FREQ * (INFORMATIVE_VARIANT_COUNT / IVC_mean)
  )

# Visualize the relationship between informative variant count and ALT_FREQ
ggplot(read_freq, aes(x = INFORMATIVE_VARIANT_COUNT, y = ALT_FREQ)) +
  geom_point(alpha = 0.6) +
  labs(title = paste("Relationship between informative variant count and ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""),
       x = "Number of informative variants in bin", 
       y = "Alternative allele frequency") +
  theme_minimal()

# Compare original vs normalized frequencies
p1 <- ggplot(read_freq, aes(x = ALT_FREQ)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = "Original ALT_FREQ distribution", x = "ALT_FREQ") +
  theme_minimal()

p2 <- ggplot(read_freq, aes(x = ALT_FREQ_WEIGHTED)) +
  geom_histogram(bins = 30, fill = "tomato", color = "white") +
  labs(title = "Weighted ALT_FREQ distribution", x = "ALT_FREQ_WEIGHTED") +
  theme_minimal()

ggpubr::ggarrange(p1, p2, ncol = 2)

Zero-ALT Bin Identification

# Check how many bins have ALT_FREQ = 0
zero_alt_bins <- sum(read_freq$ALT_FREQ_WEIGHTED == 0)
total_bins <- nrow(read_freq)

cat("Bins with ALT_FREQ_WEIGHTED = 0:", 
    zero_alt_bins, "out of", total_bins, 
    "(", round(100 * zero_alt_bins / total_bins, 1), "%)\n")
## Bins with ALT_FREQ_WEIGHTED = 0: 5 out of 2132 ( 0.2 %)
# Create a separate dataset for non-zero bins (for clustering)
non_zero_bins <- read_freq %>%
  filter(ALT_FREQ_WEIGHTED > 0)

cat("Bins with ALT_FREQ_WEIGHTED > 0 (to be clustered):", 
    nrow(non_zero_bins), "out of", total_bins,
    "(", round(100 * nrow(non_zero_bins) / total_bins, 1), "%)\n")
## Bins with ALT_FREQ_WEIGHTED > 0 (to be clustered): 2127 out of 2132 ( 99.8 %)

Data Transformation for Clustering

# Create transformations for better clustering performance
non_zero_bins$log <- log10(non_zero_bins$ALT_FREQ_WEIGHTED)
non_zero_bins$asin <- asin(sqrt(non_zero_bins$ALT_FREQ_WEIGHTED))

# Visualize distributions of the normalized and transformed data
p1 <- ggplot(non_zero_bins, aes(x = ALT_FREQ_WEIGHTED)) + 
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = paste("Weighted ALT_FREQ distribution (non-zero bins, ", BIN_SIZE_LABEL, ")", sep=""), 
       x = "ALT_FREQ_WEIGHTED") +
  theme_minimal()

p2 <- ggplot(non_zero_bins, aes(x = asin)) + 
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = "Arcsin transformation (non-zero bins)",
       x = "asin(sqrt(ALT_FREQ_WEIGHTED))") +
  theme_minimal()

p3 <- ggplot(non_zero_bins, aes(x = log)) + 
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = "Log10 transformation (non-zero bins)",
       x = "log10(ALT_FREQ_WEIGHTED)") +
  theme_minimal()

ggpubr::ggarrange(p1, p2, p3, common.legend = TRUE)

Clustering Methods Application

K <- 3

# Apply K-means clustering with k=3 to each transformation for non-zero bins
non_zero_bins$K <- as.factor(Ckmeans.1d.dp(non_zero_bins$ALT_FREQ_WEIGHTED, K)$cluster)
non_zero_bins$Klog <- as.factor(Ckmeans.1d.dp(non_zero_bins$log, K)$cluster)
non_zero_bins$Kasin <- as.factor(Ckmeans.1d.dp(non_zero_bins$asin, K)$cluster)

# Function to relabel clusters as REF, HET, ALT based on ALT_FREQ
relabel_clusters <- function(clusters, data) {
  # Calculate mean ALT_FREQ for each cluster
  cluster_means <- tapply(data$ALT_FREQ, clusters, mean)
  
  # Order clusters by mean ALT_FREQ
  ordered_clusters <- order(cluster_means)
  
  # Create mapping from original cluster to REF, HET, ALT
  cluster_map <- rep(NA, length(unique(as.numeric(clusters))))
  cluster_map[ordered_clusters[1]] <- "REF"
  cluster_map[ordered_clusters[2]] <- "HET"
  cluster_map[ordered_clusters[3]] <- "ALT"
  
  # Apply mapping to original clusters
  return(factor(cluster_map[as.numeric(clusters)], levels = c("REF", "HET", "ALT")))
}

# Apply relabeling to each clustering method
non_zero_bins$K <- relabel_clusters(non_zero_bins$K, non_zero_bins)
non_zero_bins$Klog <- relabel_clusters(non_zero_bins$Klog, non_zero_bins)
non_zero_bins$Kasin <- relabel_clusters(non_zero_bins$Kasin, non_zero_bins)

# Check cluster proportions for each method
cat("K-means cluster proportions (non-zero bins):\n")
## K-means cluster proportions (non-zero bins):
table(non_zero_bins$K) %>% prop.table() %>% round(3)
## 
##   REF   HET   ALT 
## 0.969 0.018 0.012
cat("\nLog-transformed K-means cluster proportions (non-zero bins):\n")
## 
## Log-transformed K-means cluster proportions (non-zero bins):
table(non_zero_bins$Klog) %>% prop.table() %>% round(3)
## 
##   REF   HET   ALT 
## 0.347 0.620 0.033
cat("\nArcsin-transformed K-means cluster proportions (non-zero bins):\n")
## 
## Arcsin-transformed K-means cluster proportions (non-zero bins):
table(non_zero_bins$Kasin) %>% prop.table() %>% round(3)
## 
##   REF   HET   ALT 
## 0.968 0.019 0.014

Gaussian Mixture Modeling

# Fit Gaussian mixture model with 3 components
normalest <- REBMIX(
  Dataset = list(data.frame(Value = non_zero_bins$ALT_FREQ)),
  Preprocessing = "histogram",
  cmin = 3,  # Force 3 components
  cmax = 3,
  Criterion = "BIC",
  pdf = "normal"
)

# Assign clusters
normclu <- RCLRMIX(x = normalest)
non_zero_bins$Kgmm <- as.factor(normclu@Zp)

# Relabel Kgmm clusters
non_zero_bins$Kgmm <- relabel_clusters(non_zero_bins$Kgmm, non_zero_bins)

# Check Kgmm cluster proportions
cat("Gaussian Mixture Model cluster proportions (non-zero bins):\n")
## Gaussian Mixture Model cluster proportions (non-zero bins):
table(non_zero_bins$Kgmm) %>% prop.table() %>% round(3)
## 
##   REF   HET   ALT 
## 0.956 0.017 0.026

Merge Clustering Results to Full Dataset

# Create new columns in read_freq for each clustering method
read_freq$K <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Klog <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Kasin <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Kgmm <- factor(NA, levels = c("REF", "HET", "ALT"))

# Assign the non-zero bin clustering results to the main dataset
for (method in c("K", "Klog", "Kasin", "Kgmm")) {
  # Create an index for non-zero bins in the original data
  non_zero_idx <- which(read_freq$ALT_FREQ_WEIGHTED > 0)
  
  # Assign cluster classifications to non-zero bins
  read_freq[[method]][non_zero_idx] <- non_zero_bins[[method]]
}

# Force bins with ALT_FREQ = 0 to be "REF"
zero_idx <- which(read_freq$ALT_FREQ == 0)
for (method in c("K", "Klog", "Kasin", "Kgmm")) {
  read_freq[[method]][zero_idx] <- "REF"
}

# Verify the merging worked correctly
cat("Final cluster proportions for raw K-means:\n")
## Final cluster proportions for raw K-means:
table(read_freq$K) %>% prop.table() %>% round(3)
## 
##   REF   HET   ALT 
## 0.970 0.018 0.012

HMM Smoothing Implementation

Theoretical Background

For maize introgression lines produced through backcrossing followed by selfing (e.g., BC2S3), we can calculate expected genotype frequencies based on Mendelian genetics:

# Create mating matrices 
create_mating_matrices <- function() {
  # Backcross matrix for crossing population with AA donor
  # Each COLUMN represents offspring distribution from that parent × AA
  # Column 1: AA × AA → [1, 0, 0] (100% AA offspring)
  # Column 2: Aa × AA → [1/2, 1/2, 0] (50% AA, 50% Aa offspring)  
  # Column 3: aa × AA → [0, 1, 0] (100% Aa offspring)
  backcross_AA <- matrix(c(
    1, 1/2, 0,    
    0, 1/2, 1,    
    0,   0, 0     
  ), nrow = 3, byrow = TRUE)
  
  # Backcross matrix for crossing population with aa donor
  # Column 1: AA × aa → [0, 1, 0] (100% Aa offspring)
  # Column 2: Aa × aa → [0, 1/2, 1/2] (50% Aa, 50% aa offspring)
  # Column 3: aa × aa → [0, 0, 1] (100% aa offspring)
  backcross_aa <- matrix(c(
    0,   0, 0,    
    1, 1/2, 0,    
    0, 1/2, 1     
  ), nrow = 3, byrow = TRUE)
  
  # Selfing matrix - each individual self-fertilizes
  # Column 1: AA selfed → [1, 0, 0] (100% AA offspring)
  # Column 2: Aa selfed → [1/4, 1/2, 1/4] (25% AA, 50% Aa, 25% aa offspring)
  # Column 3: aa selfed → [0, 0, 1] (100% aa offspring)
  selfing <- matrix(c(
    1, 1/4, 0,    
    0, 1/2, 0,    
    0, 1/4, 1     
  ), nrow = 3, byrow = TRUE)
  
  return(list(
    backcross_AA = backcross_AA,
    backcross_aa = backcross_aa,
    selfing = selfing
  ))
}

# Function to calculate genotype frequencies after breeding scheme
calculate_nil_frequencies <- function(bc=2, s=3, donor_type = "aa", from = c(0,1,0) ) {
  # Input validation
  if (bc  < 0 || s < 0) {
    stop("Number of backcrosses and self generations must be non-negative")
  }
  if (!donor_type %in% c("AA", "aa")) {
    stop("Donor type must be 'AA' or 'aa'")
  }
  
  # Get mating matrices
  matrices <- create_mating_matrices()
  
  # Initial F1 generation - all heterozygous after crossing pure lines
  # Population vector: [AA frequency, Aa frequency, aa frequency]
  current_population <- matrix(from, ncol = 1)
  
  # Apply backcrosses sequentially
  if (bc > 0) {
    # Select appropriate backcross matrix based on donor type
    backcross_matrix <- if (donor_type == "aa") {
      matrices$backcross_AA
    } else {
      matrices$backcross_aa
    }
    
    # Each backcross: Matrix %*% population_vector
    # This takes weighted average of matrix columns based on current population
    for (i in 1:bc) {
      current_population <- backcross_matrix %*% current_population
      
      # Optional: print intermediate results for debugging
      if (exists("debug_nil") && debug_nil) {
        cat("After backcross", i, ": [", 
            round(current_population[1,1], 4), ", ",
            round(current_population[2,1], 4), ", ", 
            round(current_population[3,1], 4), "]\n")
      }
    }
  }
  
  # Apply selfing generations sequentially  
  if (s > 0) {
    # Each selfing: selfing_matrix %*% population_vector
    # Each column represents offspring from selfing that genotype
    for (i in 1:s) {
      current_population <- matrices$selfing %*% current_population
      
      # Optional: print intermediate results for debugging
      if (exists("debug_nil") && debug_nil) {
        cat("After self", i, ": [", 
            round(current_population[1,1], 4), ", ",
            round(current_population[2,1], 4), ", ", 
            round(current_population[3,1], 4), "]\n")
      }
    }
  }
  
  # Extract final frequencies as named vector
  final_frequencies <- as.vector(current_population)
  names(final_frequencies) <- c("AA", "Aa", "aa")
  
  return(final_frequencies)
}

# Function to return frequencies in HMM format (REF, HET, ALT)
nil_frequencies_for_hmm <- function(bc, s, donor_type = "aa", from = c(0,1,0)) {
  # Calculate raw genotype frequencies
  genotype_freqs <- calculate_nil_frequencies(bc, s, donor_type, from)
  
  # Convert to HMM format based on which allele is being introgressed
  if (donor_type == "AA") {
    # AA is the introgressed (donor) allele
    # aa represents the recurrent parent background (REF)
    # AA represents regions with donor allele (ALT)
    hmm_frequencies <- c(
      REF = genotype_freqs["aa"],  # Recurrent parent background
      HET = genotype_freqs["Aa"],  # Heterozygous regions  
      ALT = genotype_freqs["AA"]   # Donor introgression
    )
  } else {
    # aa is the introgressed (donor) allele
    # AA represents the recurrent parent background (REF)
    # aa represents regions with donor allele (ALT)
    hmm_frequencies <- c(
      REF = genotype_freqs["AA"],  # Recurrent parent background
      HET = genotype_freqs["Aa"],  # Heterozygous regions
      ALT = genotype_freqs["aa"]   # Donor introgression  
    )
  }
  names(hmm_frequencies) <- c("REF", "HET","ALT")
  return(hmm_frequencies)
}

bc2s3 <- nil_frequencies_for_hmm(bc=2,s=3)

cat("Expected genotype frequencies for BC2S3:\n")
## Expected genotype frequencies for BC2S3:
cat("REF:", round(bc2s3["REF"], 4), "\n")
## REF: 0.8594
cat("HET:", round(bc2s3["HET"], 4), "\n") 
## HET: 0.0312
cat("ALT:", round(bc2s3["ALT"], 4), "\n")
## ALT: 0.1094

HMM Smoothing Function

# Function to apply HMM smoothing to ancestry calls
smooth_ancestry_with_hmm <- function(genotypes, transitions = c(0.96, 0.04)) {
  # Convert genotypes to numeric (0=REF, 1=HET, 2=ALT)
  geno_numeric <- as.numeric(factor(genotypes, levels=c("REF", "HET", "ALT"))) - 1
  
  # Set up transition probabilities (high probability of staying in same state)
  trans_prob <- matrix(c(
    transitions[1], transitions[2]/2, transitions[2]/2,  # From REF
    transitions[2]/2, transitions[1], transitions[2]/2,  # From HET
    transitions[2]/2, transitions[2]/2, transitions[1]   # From ALT
  ), nrow=3, byrow=TRUE)
  
  # Set up emission probabilities (how likely each state produces observed genotypes)
  emiss_prob <- matrix(c(
    0.9, 0.08, 0.02,  # REF state
    0.1, 0.8, 0.1,    # HET state
    0.02, 0.08, 0.9   # ALT state
  ), nrow=3, byrow=TRUE)
  
  # Initialize HMM with BC2S3 priors
  hmm <- initHMM(c("REF", "HET", "ALT"), c("0", "1", "2"), 
                startProbs = bc2s3,  # Prior probabilities from breeding design
                transProbs = trans_prob, 
                emissionProbs = emiss_prob)
  
  # Run Viterbi algorithm to find most likely state sequence
  viterbi_path <- viterbi(hmm, as.character(geno_numeric))
  
  # Convert back to genotype calls
  smoothed_genotypes <- factor(viterbi_path, levels=c("REF", "HET", "ALT"))
  
  return(smoothed_genotypes)
}

Apply HMM Smoothing

# Apply HMM smoothing to each chromosome separately
read_freq <- read_freq %>%
  group_by(CONTIG) %>%
  mutate(
    K_HMM = smooth_ancestry_with_hmm(K, transitions=c(0.995, 0.005)),
    Kasin_HMM = smooth_ancestry_with_hmm(Kasin, transitions=c(0.995, 0.005)),
    Klog_HMM = smooth_ancestry_with_hmm(Klog, transitions=c(0.995, 0.005)),
    Kgmm_HMM = smooth_ancestry_with_hmm(Kgmm, transitions=c(0.995, 0.005))
  ) %>%
  ungroup()

# Ensure chromosome factor ordering
read_freq$CONTIG <- factor(read_freq$CONTIG, levels = paste0("chr", 1:10))

Visualization and Method Comparison

Single Method Visualization

# Define color palette
pal <- c("REF" = "gold", "HET" = "springgreen4", "ALT" = "purple4")

# Create tile plot function
create_tile <- function(data, method_col, title) {
  data %>%
    mutate(
      CONTIG = factor(CONTIG, levels = paste0("chr", 1:10)),
      POS = BIN_POS * BIN_SIZE
    ) %>%
    ggplot(aes(x = POS, y = 1, fill = .data[[method_col]])) +
    geom_tile() +
    facet_wrap( .~CONTIG, ncol = 1, strip.position = "left") +
    scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
    scale_fill_manual(values = pal) +
    labs(title = title, x = "Position (bp)", fill = "Genotype") +
    ggpubr::theme_classic2(base_size = 10) +
    theme(
      legend.position = "bottom",
      strip.background = element_blank(),
      strip.text.y.left = element_text(angle = 0, hjust = 0),
      panel.spacing.y = unit(0.0, "lines"),
      axis.line.y = element_blank(),
      axis.text.y = element_blank(),
      axis.title.y = element_blank(),
      axis.ticks.y = element_blank()
    ) 
}

HMM Smoothing Effect

# Compare before and after HMM smoothing
create_tile(read_freq, "Kgmm", paste("Before HMM Smoothing |", BIN_SIZE_LABEL, "bins")) 

create_tile(read_freq, "Kgmm_HMM", paste("After HMM Smoothing |", BIN_SIZE_LABEL, "bins"))

All Methods Comparison

# Create a combined plot for all methods including HMM
read_freq %>%
  pivot_longer(cols = starts_with("K"), names_to = "method", values_to = "genotype") %>%
  mutate(
    POS = BIN_POS * BIN_SIZE,
    method = factor(method, 
                   levels = c("K", "K_HMM", "Klog", "Klog_HMM", "Kasin", "Kasin_HMM", "Kgmm", "Kgmm_HMM"),
                   labels = c("K-means (weighted)", "K-means (weighted)\nHMM",
                              "K-means (log10)", "K-means (log10)\nHMM",
                              "K-means (arcsin)", "K-means (arcsin)\nHMM",
                              "Gaussian Mixture", "Gaussian Mixture]\nHMM"))
  ) %>%
  ggplot(aes(x = POS, y = 1, fill = genotype)) +
  geom_tile() +
  scale_fill_manual(values = pal) +
  facet_grid(method ~ CONTIG, scales = "free") +
  scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  ggpubr::theme_classic2() +
  theme(
    legend.position = "bottom",
    strip.background = element_blank(),
    strip.text.x = element_text(hjust = 0),
    strip.text.y = element_text(angle = 0, face = "bold"),
    panel.spacing.x = unit(0.0, "lines"),
    panel.spacing.y = unit(0.0, "lines"),
    axis.line.y = element_blank(),
    axis.text.y = element_blank(),
    axis.line.x = element_blank(),
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  labs(
    title = paste("Comparison of Clustering Methods with HMM Smoothing (", BIN_SIZE_LABEL, " bins)", sep=""),
    x = "Genome Position (Mb)",
    fill = "Genotype"
  )

Quantitative Method Assessment

Genotype Frequency Analysis

# Create a summary of genotype frequencies for all methods
genotype_freq <- read_freq %>%
  pivot_longer(cols = c(K, Klog, Kasin, Kgmm, K_HMM, Klog_HMM, Kasin_HMM, Kgmm_HMM), 
               names_to = "method", 
               values_to = "genotype") %>%
  group_by(method, genotype) %>%
  summarize(
    count = n(),
    percentage = n() / nrow(read_freq) * 100,
    .groups = "drop"
  ) %>%
  mutate(
    method_type = ifelse(grepl("_HMM", method), "HMM Smoothed", "Original"),
    method_base = gsub("_HMM", "", method),
    method = case_when(
      method == "K" ~ "K-means (weighted)",
      method == "K_HMM" ~ "K-means (weighted) HMM",
      method == "Klog" ~ "K-means (log10)",
      method == "Klog_HMM" ~ "K-means (log10) HMM",
      method == "Kasin" ~ "K-means (arcsin)",
      method == "Kasin_HMM" ~ "K-means (arcsin) HMM",
      method == "Kgmm" ~ "Gaussian Mixture",
      method == "Kgmm_HMM" ~ "Gaussian Mixture HMM",
      TRUE ~ method
    )
  )

# Add expected frequencies
expectation <- data.frame(
  method = "Expected (BC2S3)",
  genotype = c("REF", "HET", "ALT"),
  count = round(bc2s3 * nrow(read_freq), 0),
  percentage = 100 * bc2s3,
  method_type = "Theoretical",
  method_base = "Expected"
)

# Combine and create frequency table
method_freq <- rbind(expectation, genotype_freq) %>%
  select(method, genotype, percentage) %>%
  pivot_wider(
    id_cols = genotype,
    names_from = method,
    values_from = percentage
  )

# Display frequency table
method_freq %>%
  knitr::kable(
    caption = paste("Genotype frequencies (%) across methods (", BIN_SIZE_LABEL, " bins)", sep=""),
    digits = 1
  )
Genotype frequencies (%) across methods (1Mb bins)
genotype Expected (BC2S3) K-means (weighted) K-means (weighted) HMM K-means (arcsin) K-means (arcsin) HMM Gaussian Mixture Gaussian Mixture HMM K-means (log10) K-means (log10) HMM
REF 85.9 97.0 97.4 96.8 97.4 95.6 97.0 34.9 22.7
HET 3.1 1.8 1.1 1.9 1.1 1.7 0.5 61.8 74.1
ALT 10.9 1.2 1.5 1.4 1.5 2.6 2.5 3.3 3.2
# Visualize frequencies
rbind(expectation, genotype_freq) %>%
  ggplot(aes(x = reorder(method, percentage), y = percentage, fill = genotype)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_manual(values = pal) +
  labs(
    title = paste("Genotype frequencies across methods (", BIN_SIZE_LABEL, " bins)", sep=""),
    x = "Method",
    y = "Percentage (%)",
    fill = "Genotype"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top"
  ) +
  coord_flip()

Method Agreement Analysis

# Calculate agreement between methods including HMM versions
methods <- c("K", "Klog", "Kasin", "Kgmm", "K_HMM", "Klog_HMM", "Kasin_HMM", "Kgmm_HMM")
agreement_matrix <- matrix(NA, length(methods), length(methods))
colnames(agreement_matrix) <- methods
rownames(agreement_matrix) <- methods

for (i in 1:length(methods)) {
  for (j in 1:length(methods)) {
    agreement_matrix[i, j] <- mean(read_freq[[methods[i]]] == read_freq[[methods[j]]]) * 100
  }
}

# Display agreement matrix
knitr::kable(agreement_matrix, caption = "Percentage agreement between methods", digits = 1)
Percentage agreement between methods
K Klog Kasin Kgmm K_HMM Klog_HMM Kasin_HMM Kgmm_HMM
K 100.0 36.1 99.7 97.3 98.8 23.9 98.8 98.2
Klog 36.1 100.0 36.3 38.2 36.3 74.1 36.3 37.2
Kasin 99.7 36.3 100.0 97.6 98.8 24.1 98.8 98.4
Kgmm 97.3 38.2 97.6 100.0 97.3 25.7 97.3 98.2
K_HMM 98.8 36.3 98.8 97.3 100.0 24.2 100.0 98.9
Klog_HMM 23.9 74.1 24.1 25.7 24.2 100.0 24.2 25.1
Kasin_HMM 98.8 36.3 98.8 97.3 100.0 24.2 100.0 98.9
Kgmm_HMM 98.2 37.2 98.4 98.2 98.9 25.1 98.9 100.0
# Visualize agreement matrix
agreement_df <- as.data.frame(agreement_matrix) %>%
  tibble::rownames_to_column("Method1") %>%
  pivot_longer(-Method1, names_to = "Method2", values_to = "Agreement")

ggplot(agreement_df, aes(x = Method1, y = Method2, fill = Agreement)) +
  geom_tile() +
  geom_text(aes(label = round(Agreement, 1)), color = "white", size = 3) +
  scale_fill_gradient(low = "steelblue", high = "darkred") +
  labs(
    title = "Agreement Between Methods (%)",
    x = NULL,
    y = NULL,
    fill = "Agreement (%)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(face = "bold"),
    panel.grid = element_blank()
  )

HMM Smoothing Effect Quantification

# Calculate how much each method changed after HMM smoothing
hmm_effects <- data.frame(
  method = c("K", "Klog", "Kasin", "Kgmm"),
  changes = c(
    mean(read_freq$K != read_freq$K_HMM) * 100,
    mean(read_freq$Klog != read_freq$Klog_HMM) * 100,
    mean(read_freq$Kasin != read_freq$Kasin_HMM) * 100,
    mean(read_freq$Kgmm != read_freq$Kgmm_HMM) * 100
  )
)

# Calculate direction of changes (transitions between states)
transition_summary <- data.frame()

for (method in c("K", "Klog", "Kasin", "Kgmm")) {
  hmm_method <- paste0(method, "_HMM")
  
  transitions <- table(read_freq[[method]], read_freq[[hmm_method]])
  
  # Calculate off-diagonal (changed) proportions
  total_changes <- sum(transitions) - sum(diag(transitions))
  
  for (i in rownames(transitions)) {
    for (j in colnames(transitions)) {
      if (i != j && transitions[i, j] > 0) {
        transition_summary <- rbind(transition_summary, data.frame(
          method = method,
          from = i,
          to = j,
          count = transitions[i, j],
          percentage = transitions[i, j] / nrow(read_freq) * 100
        ))
      }
    }
  }
}

# Display HMM effects
cat("Percentage of bins changed by HMM smoothing:\n")
## Percentage of bins changed by HMM smoothing:
hmm_effects %>%
  mutate(method = case_when(
    method == "K" ~ "K-means (weighted)",
    method == "Klog" ~ "K-means (log10)",
    method == "Kasin" ~ "K-means (arcsin)",
    method == "Kgmm" ~ "Gaussian Mixture",
    TRUE ~ method
  )) %>%
  knitr::kable(col.names = c("Method", "% Bins Changed"), digits = 2)
Method % Bins Changed
K-means (weighted) 1.22
K-means (log10) 25.94
K-means (arcsin) 1.22
Gaussian Mixture 1.78
# Display transition summary
cat("\nDetailed transition patterns:\n")
## 
## Detailed transition patterns:
transition_summary %>%
  mutate(
    transition = paste(from, "→", to),
    method = case_when(
      method == "K" ~ "K-means (weighted)",
      method == "Klog" ~ "K-means (log10)",
      method == "Kasin" ~ "K-means (arcsin)",
      method == "Kgmm" ~ "Gaussian Mixture",
      TRUE ~ method
    )
  ) %>%
  select(method, transition, count, percentage) %>%
  knitr::kable(digits = 2)
method transition count percentage
K-means (weighted) REF → HET 3 0.14
K-means (weighted) REF → ALT 2 0.09
K-means (weighted) HET → REF 13 0.61
K-means (weighted) HET → ALT 6 0.28
K-means (weighted) ALT → REF 2 0.09
K-means (log10) REF → HET 403 18.90
K-means (log10) REF → ALT 2 0.09
K-means (log10) HET → REF 143 6.71
K-means (log10) HET → ALT 1 0.05
K-means (log10) ALT → REF 1 0.05
K-means (log10) ALT → HET 3 0.14
K-means (arcsin) REF → HET 2 0.09
K-means (arcsin) REF → ALT 1 0.05
K-means (arcsin) HET → REF 15 0.70
K-means (arcsin) HET → ALT 5 0.23
K-means (arcsin) ALT → REF 2 0.09
K-means (arcsin) ALT → HET 1 0.05
Gaussian Mixture REF → ALT 2 0.09
Gaussian Mixture HET → REF 28 1.31
Gaussian Mixture HET → ALT 2 0.09
Gaussian Mixture ALT → REF 3 0.14
Gaussian Mixture ALT → HET 3 0.14
# Visualize HMM effects
ggplot(hmm_effects, aes(x = reorder(method, changes), y = changes)) +
  geom_col(fill = "steelblue", alpha = 0.7) +
  geom_text(aes(label = paste0(round(changes, 1), "%")), 
            hjust = -0.1, size = 4) +
  labs(
    title = "Effect of HMM Smoothing on Different Clustering Methods",
    x = "Clustering Method",
    y = "Percentage of Bins Changed (%)"
  ) +
  theme_minimal() +
  coord_flip()

Method Disagreement Analysis

# Analyze where methods disagree most
disagreement_data <- read_freq %>%
  mutate(
    POS = BIN_POS * BIN_SIZE,
    # Check if all original methods agree
    original_agreement = factor(ifelse(
      K == Klog & K == Kasin & K == Kgmm,
      "All methods agree",
      "Methods disagree"
    ), levels = c("All methods agree", "Methods disagree")),
    # Check if all HMM methods agree
    hmm_agreement = factor(ifelse(
      K_HMM == Klog_HMM & K_HMM == Kasin_HMM & K_HMM == Kgmm_HMM,
      "All methods agree",
      "Methods disagree"
    ), levels = c("All methods agree", "Methods disagree"))
  )

# Calculate disagreement statistics
original_disagreement <- mean(disagreement_data$original_agreement == "Methods disagree") * 100
hmm_disagreement <- mean(disagreement_data$hmm_agreement == "Methods disagree") * 100

cat("Original methods disagreement:", round(original_disagreement, 1), "%\n")
## Original methods disagreement: 64.1 %
cat("HMM smoothed methods disagreement:", round(hmm_disagreement, 1), "%\n")
## HMM smoothed methods disagreement: 75.9 %
cat("Reduction in disagreement:", round(original_disagreement - hmm_disagreement, 1), "percentage points\n")
## Reduction in disagreement: -11.8 percentage points
# Create tile plot showing disagreement regions
p1 <- ggplot(disagreement_data, aes(x = POS, y = 1, fill = original_agreement)) +
  geom_tile() +
  scale_fill_manual(values = c("darkgreen", "darkred")) +
  facet_grid(. ~ CONTIG, scales = "free_x") +
  scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  labs(title = "Original Methods Agreement/Disagreement", fill = NULL) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    strip.background = element_blank(),
    axis.line.y = element_blank(),
    axis.text.y = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    panel.spacing.x = unit(0.0, "lines")
  )

p2 <- ggplot(disagreement_data, aes(x = POS, y = 1, fill = hmm_agreement)) +
  geom_tile() +
  scale_fill_manual(values = c("darkgreen", "darkred")) +
  facet_grid(. ~ CONTIG, scales = "free_x") +
  scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  labs(title = "HMM Smoothed Methods Agreement/Disagreement", 
       x = "Genome Position (Mb)", fill = NULL) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    strip.background = element_blank(),
    axis.line.y = element_blank(),
    axis.text.y = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.spacing.x = unit(0.0, "lines")
  )

ggpubr::ggarrange(p1, p2, ncol = 1, common.legend = TRUE)

Conclusions and Recommendations

Based on this comprehensive analysis of ancestry calling methods with HMM smoothing:

Key Findings

  1. Method Performance: All clustering methods show reasonable performance, with the Gaussian Mixture Model and arcsin-transformed K-means showing slightly better separation of genotype classes.

  2. HMM Smoothing Benefits:

    • Reduces noise in ancestry calls by an average of 7.5% across methods
    • Improves agreement between methods by -11.8 percentage points
    • Enforces biological constraints based on breeding design expectations
  3. Genotype Frequency Alignment: HMM smoothing brings observed frequencies closer to theoretical BC2S3 expectations, particularly for ALT regions.

Recommendations

  1. Preferred Method: Use the Gaussian Mixture Model with HMM smoothing (Kgmm_HMM) for final ancestry calls, as it:
    • Shows good separation between genotype classes
    • Incorporates breeding design priors through HMM
    • Produces frequencies closest to theoretical expectations
  2. Quality Control:
    • Monitor bins with very low variant counts (< 5 variants) for reliability
    • Consider chromosome-specific analysis for regions with high disagreement
    • Validate results with independent markers or phenotypic data when available
  3. Parameter Optimization:
    • HMM transition probabilities (currently 0.995 stay, 0.005 switch) may need adjustment based on:
      • Linkage map resolution
      • Expected recombination frequency
      • Specific breeding design
  4. Bin Size Considerations:
    • Current 1Mb bins provide good balance between resolution and statistical power
    • Consider smaller bins (500kb) for fine-mapping or larger bins (5Mb) for genome-wide patterns

Future Directions

  1. Multi-sample Joint Calling: Implement joint HMM across multiple samples to leverage shared recombination patterns
  2. Copy Number Integration: Incorporate copy number variation detection to improve accuracy in duplicated regions
  3. Functional Annotation: Overlay ancestry calls with gene annotations to identify functionally relevant introgression regions

Session Information

# Record session information for reproducibility
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-apple-darwin20
## Running under: macOS 15.3.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] tibble_3.2.1        ggpubr_0.6.0        scales_1.4.0       
## [4] tidyr_1.3.1         HMM_1.0.1           rebmix_2.16.0      
## [7] Ckmeans.1d.dp_4.3.5 ggplot2_3.5.2       dplyr_1.1.4        
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        generics_0.1.4     rstatix_0.7.2      digest_0.6.37     
##  [5] magrittr_2.0.3     evaluate_1.0.3     grid_4.4.1         RColorBrewer_1.1-3
##  [9] fastmap_1.2.0      jsonlite_2.0.0     backports_1.5.0    Formula_1.2-5     
## [13] gridExtra_2.3      purrr_1.0.4        jquerylib_0.1.4    abind_1.4-8       
## [17] Rdpack_2.6.4       cli_3.6.5          rlang_1.1.6        rbibutils_2.3     
## [21] cowplot_1.1.3      withr_3.0.2        cachem_1.1.0       yaml_2.3.10       
## [25] tools_4.4.1        ggsignif_0.6.4     broom_1.0.8        vctrs_0.6.5       
## [29] R6_2.6.1           lifecycle_1.0.4    car_3.1-3          pkgconfig_2.0.3   
## [33] pillar_1.10.2      bslib_0.9.0        gtable_0.3.6       glue_1.8.0        
## [37] Rcpp_1.0.14        xfun_0.52          tidyselect_1.2.1   rstudioapi_0.17.1 
## [41] knitr_1.50         dichromat_2.0-0.1  farver_2.1.2       htmltools_0.5.8.1 
## [45] labeling_0.4.3     rmarkdown_2.29     carData_3.0-5      compiler_4.4.1