This document outlines a genomic ancestry analysis pipeline for maize introgression lines. The analysis identifies genomic ancestry across bins of size 1Mb using the following approach:
# Load required libraries
library(dplyr)        # Data manipulation
library(ggplot2)      # Plotting
library(Ckmeans.1d.dp) # K-means clustering
library(rebmix)       # Gaussian mixture model
library(HMM)          # Hidden Markov Models
library(tidyr)        # Data reshaping
library(scales)       # Unit formatting for plots
library(ggpubr)       # Plot arrangement
library(tibble)       # Modern data frames# Read the binned allelic counts
read_freq <- read.table("~/Desktop/PN17_SID1632_bin_genotypes.tsv", header=TRUE)
# Display first few rows of the binned data
head(read_freq)##   CONTIG BIN_POS VARIANT_COUNT INFORMATIVE_VARIANT_COUNT DEPTH_SUM ALT_COUNT
## 1   chr1       1          8163                      3810      7355        34
## 2   chr1       2         13158                      5512     10174        20
## 3   chr1       3         14272                      6128     11390        25
## 4   chr1       4         11263                      5084      9273        16
## 5   chr1       5         12470                      5440     10589        38
## 6   chr1       6         10971                      4497      8514        21
##      ALT_FREQ BIN_START BIN_END ALT_FREQ_NORM GENOTYPE
## 1 0.004622706      5892  999990   0.002914504      REF
## 2 0.001965795   1000069 1999892   0.001997776      REF
## 3 0.002194908   2000252 2999995   0.002419468      REF
## 4 0.001725439   3000032 3989703   0.001500972      REF
## 5 0.003588630   4002810 4999974   0.003456319      REF
## 6 0.002466526   5000001 5997031   0.002090021      REF# Calculate summary statistics
IVC_mean <- mean(read_freq$INFORMATIVE_VARIANT_COUNT)
IVC_sd <- sd(read_freq$INFORMATIVE_VARIANT_COUNT)
# Display summary of bins
cat("Total number of bins:", nrow(read_freq), "\n")## Total number of bins: 2132## Average depth sum: 9686.165## Average number of informative variants per bin: 5223.613cat("Median number of informative variants per bin:", median(read_freq$INFORMATIVE_VARIANT_COUNT), "\n")## Median number of informative variants per bin: 5093.5cat("Range of informative variants per bin:", min(read_freq$INFORMATIVE_VARIANT_COUNT), "to", max(read_freq$INFORMATIVE_VARIANT_COUNT), "\n")## Range of informative variants per bin: 13 to 10052# Create weightesALT_FREQ 
read_freq <- read_freq %>%
  mutate(
    ALT_FREQ_WEIGHTED = ALT_FREQ * (INFORMATIVE_VARIANT_COUNT / IVC_mean)
  )
# Visualize the relationship between informative variant count and ALT_FREQ
ggplot(read_freq, aes(x = INFORMATIVE_VARIANT_COUNT, y = ALT_FREQ)) +
  geom_point(alpha = 0.6) +
  labs(title = paste("Relationship between informative variant count and ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""),
       x = "Number of informative variants in bin", 
       y = "Alternative allele frequency") +
  theme_minimal()# Compare original vs normalized frequencies
p1 <- ggplot(read_freq, aes(x = ALT_FREQ)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = "Original ALT_FREQ distribution", x = "ALT_FREQ") +
  theme_minimal()
p2 <- ggplot(read_freq, aes(x = ALT_FREQ_WEIGHTED)) +
  geom_histogram(bins = 30, fill = "tomato", color = "white") +
  labs(title = "Weighted ALT_FREQ distribution", x = "ALT_FREQ_WEIGHTED") +
  theme_minimal()
ggpubr::ggarrange(p1, p2, ncol = 2)# Check how many bins have ALT_FREQ = 0
zero_alt_bins <- sum(read_freq$ALT_FREQ_WEIGHTED == 0)
total_bins <- nrow(read_freq)
cat("Bins with ALT_FREQ_WEIGHTED = 0:", 
    zero_alt_bins, "out of", total_bins, 
    "(", round(100 * zero_alt_bins / total_bins, 1), "%)\n")## Bins with ALT_FREQ_WEIGHTED = 0: 5 out of 2132 ( 0.2 %)# Create a separate dataset for non-zero bins (for clustering)
non_zero_bins <- read_freq %>%
  filter(ALT_FREQ_WEIGHTED > 0)
cat("Bins with ALT_FREQ_WEIGHTED > 0 (to be clustered):", 
    nrow(non_zero_bins), "out of", total_bins,
    "(", round(100 * nrow(non_zero_bins) / total_bins, 1), "%)\n")## Bins with ALT_FREQ_WEIGHTED > 0 (to be clustered): 2127 out of 2132 ( 99.8 %)# Create transformations for better clustering performance
non_zero_bins$log <- log10(non_zero_bins$ALT_FREQ_WEIGHTED)
non_zero_bins$asin <- asin(sqrt(non_zero_bins$ALT_FREQ_WEIGHTED))
# Visualize distributions of the normalized and transformed data
p1 <- ggplot(non_zero_bins, aes(x = ALT_FREQ_WEIGHTED)) + 
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = paste("Weighted ALT_FREQ distribution (non-zero bins, ", BIN_SIZE_LABEL, ")", sep=""), 
       x = "ALT_FREQ_WEIGHTED") +
  theme_minimal()
p2 <- ggplot(non_zero_bins, aes(x = asin)) + 
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = "Arcsin transformation (non-zero bins)",
       x = "asin(sqrt(ALT_FREQ_WEIGHTED))") +
  theme_minimal()
p3 <- ggplot(non_zero_bins, aes(x = log)) + 
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = "Log10 transformation (non-zero bins)",
       x = "log10(ALT_FREQ_WEIGHTED)") +
  theme_minimal()
ggpubr::ggarrange(p1, p2, p3, common.legend = TRUE)K <- 3
# Apply K-means clustering with k=3 to each transformation for non-zero bins
non_zero_bins$K <- as.factor(Ckmeans.1d.dp(non_zero_bins$ALT_FREQ_WEIGHTED, K)$cluster)
non_zero_bins$Klog <- as.factor(Ckmeans.1d.dp(non_zero_bins$log, K)$cluster)
non_zero_bins$Kasin <- as.factor(Ckmeans.1d.dp(non_zero_bins$asin, K)$cluster)
# Function to relabel clusters as REF, HET, ALT based on ALT_FREQ
relabel_clusters <- function(clusters, data) {
  # Calculate mean ALT_FREQ for each cluster
  cluster_means <- tapply(data$ALT_FREQ, clusters, mean)
  
  # Order clusters by mean ALT_FREQ
  ordered_clusters <- order(cluster_means)
  
  # Create mapping from original cluster to REF, HET, ALT
  cluster_map <- rep(NA, length(unique(as.numeric(clusters))))
  cluster_map[ordered_clusters[1]] <- "REF"
  cluster_map[ordered_clusters[2]] <- "HET"
  cluster_map[ordered_clusters[3]] <- "ALT"
  
  # Apply mapping to original clusters
  return(factor(cluster_map[as.numeric(clusters)], levels = c("REF", "HET", "ALT")))
}
# Apply relabeling to each clustering method
non_zero_bins$K <- relabel_clusters(non_zero_bins$K, non_zero_bins)
non_zero_bins$Klog <- relabel_clusters(non_zero_bins$Klog, non_zero_bins)
non_zero_bins$Kasin <- relabel_clusters(non_zero_bins$Kasin, non_zero_bins)
# Check cluster proportions for each method
cat("K-means cluster proportions (non-zero bins):\n")## K-means cluster proportions (non-zero bins):## 
##   REF   HET   ALT 
## 0.969 0.018 0.012## 
## Log-transformed K-means cluster proportions (non-zero bins):## 
##   REF   HET   ALT 
## 0.347 0.620 0.033## 
## Arcsin-transformed K-means cluster proportions (non-zero bins):## 
##   REF   HET   ALT 
## 0.968 0.019 0.014# Fit Gaussian mixture model with 3 components
normalest <- REBMIX(
  Dataset = list(data.frame(Value = non_zero_bins$ALT_FREQ)),
  Preprocessing = "histogram",
  cmin = 3,  # Force 3 components
  cmax = 3,
  Criterion = "BIC",
  pdf = "normal"
)
# Assign clusters
normclu <- RCLRMIX(x = normalest)
non_zero_bins$Kgmm <- as.factor(normclu@Zp)
# Relabel Kgmm clusters
non_zero_bins$Kgmm <- relabel_clusters(non_zero_bins$Kgmm, non_zero_bins)
# Check Kgmm cluster proportions
cat("Gaussian Mixture Model cluster proportions (non-zero bins):\n")## Gaussian Mixture Model cluster proportions (non-zero bins):## 
##   REF   HET   ALT 
## 0.956 0.017 0.026# Create new columns in read_freq for each clustering method
read_freq$K <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Klog <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Kasin <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Kgmm <- factor(NA, levels = c("REF", "HET", "ALT"))
# Assign the non-zero bin clustering results to the main dataset
for (method in c("K", "Klog", "Kasin", "Kgmm")) {
  # Create an index for non-zero bins in the original data
  non_zero_idx <- which(read_freq$ALT_FREQ_WEIGHTED > 0)
  
  # Assign cluster classifications to non-zero bins
  read_freq[[method]][non_zero_idx] <- non_zero_bins[[method]]
}
# Force bins with ALT_FREQ = 0 to be "REF"
zero_idx <- which(read_freq$ALT_FREQ == 0)
for (method in c("K", "Klog", "Kasin", "Kgmm")) {
  read_freq[[method]][zero_idx] <- "REF"
}
# Verify the merging worked correctly
cat("Final cluster proportions for raw K-means:\n")## Final cluster proportions for raw K-means:## 
##   REF   HET   ALT 
## 0.970 0.018 0.012For maize introgression lines produced through backcrossing followed by selfing (e.g., BC2S3), we can calculate expected genotype frequencies based on Mendelian genetics:
# Create mating matrices 
create_mating_matrices <- function() {
  # Backcross matrix for crossing population with AA donor
  # Each COLUMN represents offspring distribution from that parent × AA
  # Column 1: AA × AA → [1, 0, 0] (100% AA offspring)
  # Column 2: Aa × AA → [1/2, 1/2, 0] (50% AA, 50% Aa offspring)  
  # Column 3: aa × AA → [0, 1, 0] (100% Aa offspring)
  backcross_AA <- matrix(c(
    1, 1/2, 0,    
    0, 1/2, 1,    
    0,   0, 0     
  ), nrow = 3, byrow = TRUE)
  
  # Backcross matrix for crossing population with aa donor
  # Column 1: AA × aa → [0, 1, 0] (100% Aa offspring)
  # Column 2: Aa × aa → [0, 1/2, 1/2] (50% Aa, 50% aa offspring)
  # Column 3: aa × aa → [0, 0, 1] (100% aa offspring)
  backcross_aa <- matrix(c(
    0,   0, 0,    
    1, 1/2, 0,    
    0, 1/2, 1     
  ), nrow = 3, byrow = TRUE)
  
  # Selfing matrix - each individual self-fertilizes
  # Column 1: AA selfed → [1, 0, 0] (100% AA offspring)
  # Column 2: Aa selfed → [1/4, 1/2, 1/4] (25% AA, 50% Aa, 25% aa offspring)
  # Column 3: aa selfed → [0, 0, 1] (100% aa offspring)
  selfing <- matrix(c(
    1, 1/4, 0,    
    0, 1/2, 0,    
    0, 1/4, 1     
  ), nrow = 3, byrow = TRUE)
  
  return(list(
    backcross_AA = backcross_AA,
    backcross_aa = backcross_aa,
    selfing = selfing
  ))
}
# Function to calculate genotype frequencies after breeding scheme
calculate_nil_frequencies <- function(bc=2, s=3, donor_type = "aa", from = c(0,1,0) ) {
  # Input validation
  if (bc  < 0 || s < 0) {
    stop("Number of backcrosses and self generations must be non-negative")
  }
  if (!donor_type %in% c("AA", "aa")) {
    stop("Donor type must be 'AA' or 'aa'")
  }
  
  # Get mating matrices
  matrices <- create_mating_matrices()
  
  # Initial F1 generation - all heterozygous after crossing pure lines
  # Population vector: [AA frequency, Aa frequency, aa frequency]
  current_population <- matrix(from, ncol = 1)
  
  # Apply backcrosses sequentially
  if (bc > 0) {
    # Select appropriate backcross matrix based on donor type
    backcross_matrix <- if (donor_type == "aa") {
      matrices$backcross_AA
    } else {
      matrices$backcross_aa
    }
    
    # Each backcross: Matrix %*% population_vector
    # This takes weighted average of matrix columns based on current population
    for (i in 1:bc) {
      current_population <- backcross_matrix %*% current_population
      
      # Optional: print intermediate results for debugging
      if (exists("debug_nil") && debug_nil) {
        cat("After backcross", i, ": [", 
            round(current_population[1,1], 4), ", ",
            round(current_population[2,1], 4), ", ", 
            round(current_population[3,1], 4), "]\n")
      }
    }
  }
  
  # Apply selfing generations sequentially  
  if (s > 0) {
    # Each selfing: selfing_matrix %*% population_vector
    # Each column represents offspring from selfing that genotype
    for (i in 1:s) {
      current_population <- matrices$selfing %*% current_population
      
      # Optional: print intermediate results for debugging
      if (exists("debug_nil") && debug_nil) {
        cat("After self", i, ": [", 
            round(current_population[1,1], 4), ", ",
            round(current_population[2,1], 4), ", ", 
            round(current_population[3,1], 4), "]\n")
      }
    }
  }
  
  # Extract final frequencies as named vector
  final_frequencies <- as.vector(current_population)
  names(final_frequencies) <- c("AA", "Aa", "aa")
  
  return(final_frequencies)
}
# Function to return frequencies in HMM format (REF, HET, ALT)
nil_frequencies_for_hmm <- function(bc, s, donor_type = "aa", from = c(0,1,0)) {
  # Calculate raw genotype frequencies
  genotype_freqs <- calculate_nil_frequencies(bc, s, donor_type, from)
  
  # Convert to HMM format based on which allele is being introgressed
  if (donor_type == "AA") {
    # AA is the introgressed (donor) allele
    # aa represents the recurrent parent background (REF)
    # AA represents regions with donor allele (ALT)
    hmm_frequencies <- c(
      REF = genotype_freqs["aa"],  # Recurrent parent background
      HET = genotype_freqs["Aa"],  # Heterozygous regions  
      ALT = genotype_freqs["AA"]   # Donor introgression
    )
  } else {
    # aa is the introgressed (donor) allele
    # AA represents the recurrent parent background (REF)
    # aa represents regions with donor allele (ALT)
    hmm_frequencies <- c(
      REF = genotype_freqs["AA"],  # Recurrent parent background
      HET = genotype_freqs["Aa"],  # Heterozygous regions
      ALT = genotype_freqs["aa"]   # Donor introgression  
    )
  }
  names(hmm_frequencies) <- c("REF", "HET","ALT")
  return(hmm_frequencies)
}
bc2s3 <- nil_frequencies_for_hmm(bc=2,s=3)
cat("Expected genotype frequencies for BC2S3:\n")## Expected genotype frequencies for BC2S3:## REF: 0.8594## HET: 0.0312## ALT: 0.1094# Function to apply HMM smoothing to ancestry calls
smooth_ancestry_with_hmm <- function(genotypes, transitions = c(0.96, 0.04)) {
  # Convert genotypes to numeric (0=REF, 1=HET, 2=ALT)
  geno_numeric <- as.numeric(factor(genotypes, levels=c("REF", "HET", "ALT"))) - 1
  
  # Set up transition probabilities (high probability of staying in same state)
  trans_prob <- matrix(c(
    transitions[1], transitions[2]/2, transitions[2]/2,  # From REF
    transitions[2]/2, transitions[1], transitions[2]/2,  # From HET
    transitions[2]/2, transitions[2]/2, transitions[1]   # From ALT
  ), nrow=3, byrow=TRUE)
  
  # Set up emission probabilities (how likely each state produces observed genotypes)
  emiss_prob <- matrix(c(
    0.9, 0.08, 0.02,  # REF state
    0.1, 0.8, 0.1,    # HET state
    0.02, 0.08, 0.9   # ALT state
  ), nrow=3, byrow=TRUE)
  
  # Initialize HMM with BC2S3 priors
  hmm <- initHMM(c("REF", "HET", "ALT"), c("0", "1", "2"), 
                startProbs = bc2s3,  # Prior probabilities from breeding design
                transProbs = trans_prob, 
                emissionProbs = emiss_prob)
  
  # Run Viterbi algorithm to find most likely state sequence
  viterbi_path <- viterbi(hmm, as.character(geno_numeric))
  
  # Convert back to genotype calls
  smoothed_genotypes <- factor(viterbi_path, levels=c("REF", "HET", "ALT"))
  
  return(smoothed_genotypes)
}# Apply HMM smoothing to each chromosome separately
read_freq <- read_freq %>%
  group_by(CONTIG) %>%
  mutate(
    K_HMM = smooth_ancestry_with_hmm(K, transitions=c(0.995, 0.005)),
    Kasin_HMM = smooth_ancestry_with_hmm(Kasin, transitions=c(0.995, 0.005)),
    Klog_HMM = smooth_ancestry_with_hmm(Klog, transitions=c(0.995, 0.005)),
    Kgmm_HMM = smooth_ancestry_with_hmm(Kgmm, transitions=c(0.995, 0.005))
  ) %>%
  ungroup()
# Ensure chromosome factor ordering
read_freq$CONTIG <- factor(read_freq$CONTIG, levels = paste0("chr", 1:10))# Define color palette
pal <- c("REF" = "gold", "HET" = "springgreen4", "ALT" = "purple4")
# Create tile plot function
create_tile <- function(data, method_col, title) {
  data %>%
    mutate(
      CONTIG = factor(CONTIG, levels = paste0("chr", 1:10)),
      POS = BIN_POS * BIN_SIZE
    ) %>%
    ggplot(aes(x = POS, y = 1, fill = .data[[method_col]])) +
    geom_tile() +
    facet_wrap( .~CONTIG, ncol = 1, strip.position = "left") +
    scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
    scale_fill_manual(values = pal) +
    labs(title = title, x = "Position (bp)", fill = "Genotype") +
    ggpubr::theme_classic2(base_size = 10) +
    theme(
      legend.position = "bottom",
      strip.background = element_blank(),
      strip.text.y.left = element_text(angle = 0, hjust = 0),
      panel.spacing.y = unit(0.0, "lines"),
      axis.line.y = element_blank(),
      axis.text.y = element_blank(),
      axis.title.y = element_blank(),
      axis.ticks.y = element_blank()
    ) 
}# Compare before and after HMM smoothing
create_tile(read_freq, "Kgmm", paste("Before HMM Smoothing |", BIN_SIZE_LABEL, "bins")) # Create a combined plot for all methods including HMM
read_freq %>%
  pivot_longer(cols = starts_with("K"), names_to = "method", values_to = "genotype") %>%
  mutate(
    POS = BIN_POS * BIN_SIZE,
    method = factor(method, 
                   levels = c("K", "K_HMM", "Klog", "Klog_HMM", "Kasin", "Kasin_HMM", "Kgmm", "Kgmm_HMM"),
                   labels = c("K-means (weighted)", "K-means (weighted)\nHMM",
                              "K-means (log10)", "K-means (log10)\nHMM",
                              "K-means (arcsin)", "K-means (arcsin)\nHMM",
                              "Gaussian Mixture", "Gaussian Mixture]\nHMM"))
  ) %>%
  ggplot(aes(x = POS, y = 1, fill = genotype)) +
  geom_tile() +
  scale_fill_manual(values = pal) +
  facet_grid(method ~ CONTIG, scales = "free") +
  scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  ggpubr::theme_classic2() +
  theme(
    legend.position = "bottom",
    strip.background = element_blank(),
    strip.text.x = element_text(hjust = 0),
    strip.text.y = element_text(angle = 0, face = "bold"),
    panel.spacing.x = unit(0.0, "lines"),
    panel.spacing.y = unit(0.0, "lines"),
    axis.line.y = element_blank(),
    axis.text.y = element_blank(),
    axis.line.x = element_blank(),
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  labs(
    title = paste("Comparison of Clustering Methods with HMM Smoothing (", BIN_SIZE_LABEL, " bins)", sep=""),
    x = "Genome Position (Mb)",
    fill = "Genotype"
  )# Create a summary of genotype frequencies for all methods
genotype_freq <- read_freq %>%
  pivot_longer(cols = c(K, Klog, Kasin, Kgmm, K_HMM, Klog_HMM, Kasin_HMM, Kgmm_HMM), 
               names_to = "method", 
               values_to = "genotype") %>%
  group_by(method, genotype) %>%
  summarize(
    count = n(),
    percentage = n() / nrow(read_freq) * 100,
    .groups = "drop"
  ) %>%
  mutate(
    method_type = ifelse(grepl("_HMM", method), "HMM Smoothed", "Original"),
    method_base = gsub("_HMM", "", method),
    method = case_when(
      method == "K" ~ "K-means (weighted)",
      method == "K_HMM" ~ "K-means (weighted) HMM",
      method == "Klog" ~ "K-means (log10)",
      method == "Klog_HMM" ~ "K-means (log10) HMM",
      method == "Kasin" ~ "K-means (arcsin)",
      method == "Kasin_HMM" ~ "K-means (arcsin) HMM",
      method == "Kgmm" ~ "Gaussian Mixture",
      method == "Kgmm_HMM" ~ "Gaussian Mixture HMM",
      TRUE ~ method
    )
  )
# Add expected frequencies
expectation <- data.frame(
  method = "Expected (BC2S3)",
  genotype = c("REF", "HET", "ALT"),
  count = round(bc2s3 * nrow(read_freq), 0),
  percentage = 100 * bc2s3,
  method_type = "Theoretical",
  method_base = "Expected"
)
# Combine and create frequency table
method_freq <- rbind(expectation, genotype_freq) %>%
  select(method, genotype, percentage) %>%
  pivot_wider(
    id_cols = genotype,
    names_from = method,
    values_from = percentage
  )
# Display frequency table
method_freq %>%
  knitr::kable(
    caption = paste("Genotype frequencies (%) across methods (", BIN_SIZE_LABEL, " bins)", sep=""),
    digits = 1
  )| genotype | Expected (BC2S3) | K-means (weighted) | K-means (weighted) HMM | K-means (arcsin) | K-means (arcsin) HMM | Gaussian Mixture | Gaussian Mixture HMM | K-means (log10) | K-means (log10) HMM | 
|---|---|---|---|---|---|---|---|---|---|
| REF | 85.9 | 97.0 | 97.4 | 96.8 | 97.4 | 95.6 | 97.0 | 34.9 | 22.7 | 
| HET | 3.1 | 1.8 | 1.1 | 1.9 | 1.1 | 1.7 | 0.5 | 61.8 | 74.1 | 
| ALT | 10.9 | 1.2 | 1.5 | 1.4 | 1.5 | 2.6 | 2.5 | 3.3 | 3.2 | 
# Visualize frequencies
rbind(expectation, genotype_freq) %>%
  ggplot(aes(x = reorder(method, percentage), y = percentage, fill = genotype)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_manual(values = pal) +
  labs(
    title = paste("Genotype frequencies across methods (", BIN_SIZE_LABEL, " bins)", sep=""),
    x = "Method",
    y = "Percentage (%)",
    fill = "Genotype"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top"
  ) +
  coord_flip()# Calculate agreement between methods including HMM versions
methods <- c("K", "Klog", "Kasin", "Kgmm", "K_HMM", "Klog_HMM", "Kasin_HMM", "Kgmm_HMM")
agreement_matrix <- matrix(NA, length(methods), length(methods))
colnames(agreement_matrix) <- methods
rownames(agreement_matrix) <- methods
for (i in 1:length(methods)) {
  for (j in 1:length(methods)) {
    agreement_matrix[i, j] <- mean(read_freq[[methods[i]]] == read_freq[[methods[j]]]) * 100
  }
}
# Display agreement matrix
knitr::kable(agreement_matrix, caption = "Percentage agreement between methods", digits = 1)| K | Klog | Kasin | Kgmm | K_HMM | Klog_HMM | Kasin_HMM | Kgmm_HMM | |
|---|---|---|---|---|---|---|---|---|
| K | 100.0 | 36.1 | 99.7 | 97.3 | 98.8 | 23.9 | 98.8 | 98.2 | 
| Klog | 36.1 | 100.0 | 36.3 | 38.2 | 36.3 | 74.1 | 36.3 | 37.2 | 
| Kasin | 99.7 | 36.3 | 100.0 | 97.6 | 98.8 | 24.1 | 98.8 | 98.4 | 
| Kgmm | 97.3 | 38.2 | 97.6 | 100.0 | 97.3 | 25.7 | 97.3 | 98.2 | 
| K_HMM | 98.8 | 36.3 | 98.8 | 97.3 | 100.0 | 24.2 | 100.0 | 98.9 | 
| Klog_HMM | 23.9 | 74.1 | 24.1 | 25.7 | 24.2 | 100.0 | 24.2 | 25.1 | 
| Kasin_HMM | 98.8 | 36.3 | 98.8 | 97.3 | 100.0 | 24.2 | 100.0 | 98.9 | 
| Kgmm_HMM | 98.2 | 37.2 | 98.4 | 98.2 | 98.9 | 25.1 | 98.9 | 100.0 | 
# Visualize agreement matrix
agreement_df <- as.data.frame(agreement_matrix) %>%
  tibble::rownames_to_column("Method1") %>%
  pivot_longer(-Method1, names_to = "Method2", values_to = "Agreement")
ggplot(agreement_df, aes(x = Method1, y = Method2, fill = Agreement)) +
  geom_tile() +
  geom_text(aes(label = round(Agreement, 1)), color = "white", size = 3) +
  scale_fill_gradient(low = "steelblue", high = "darkred") +
  labs(
    title = "Agreement Between Methods (%)",
    x = NULL,
    y = NULL,
    fill = "Agreement (%)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(face = "bold"),
    panel.grid = element_blank()
  )# Calculate how much each method changed after HMM smoothing
hmm_effects <- data.frame(
  method = c("K", "Klog", "Kasin", "Kgmm"),
  changes = c(
    mean(read_freq$K != read_freq$K_HMM) * 100,
    mean(read_freq$Klog != read_freq$Klog_HMM) * 100,
    mean(read_freq$Kasin != read_freq$Kasin_HMM) * 100,
    mean(read_freq$Kgmm != read_freq$Kgmm_HMM) * 100
  )
)
# Calculate direction of changes (transitions between states)
transition_summary <- data.frame()
for (method in c("K", "Klog", "Kasin", "Kgmm")) {
  hmm_method <- paste0(method, "_HMM")
  
  transitions <- table(read_freq[[method]], read_freq[[hmm_method]])
  
  # Calculate off-diagonal (changed) proportions
  total_changes <- sum(transitions) - sum(diag(transitions))
  
  for (i in rownames(transitions)) {
    for (j in colnames(transitions)) {
      if (i != j && transitions[i, j] > 0) {
        transition_summary <- rbind(transition_summary, data.frame(
          method = method,
          from = i,
          to = j,
          count = transitions[i, j],
          percentage = transitions[i, j] / nrow(read_freq) * 100
        ))
      }
    }
  }
}
# Display HMM effects
cat("Percentage of bins changed by HMM smoothing:\n")## Percentage of bins changed by HMM smoothing:hmm_effects %>%
  mutate(method = case_when(
    method == "K" ~ "K-means (weighted)",
    method == "Klog" ~ "K-means (log10)",
    method == "Kasin" ~ "K-means (arcsin)",
    method == "Kgmm" ~ "Gaussian Mixture",
    TRUE ~ method
  )) %>%
  knitr::kable(col.names = c("Method", "% Bins Changed"), digits = 2)| Method | % Bins Changed | 
|---|---|
| K-means (weighted) | 1.22 | 
| K-means (log10) | 25.94 | 
| K-means (arcsin) | 1.22 | 
| Gaussian Mixture | 1.78 | 
## 
## Detailed transition patterns:transition_summary %>%
  mutate(
    transition = paste(from, "â", to),
    method = case_when(
      method == "K" ~ "K-means (weighted)",
      method == "Klog" ~ "K-means (log10)",
      method == "Kasin" ~ "K-means (arcsin)",
      method == "Kgmm" ~ "Gaussian Mixture",
      TRUE ~ method
    )
  ) %>%
  select(method, transition, count, percentage) %>%
  knitr::kable(digits = 2)| method | transition | count | percentage | 
|---|---|---|---|
| K-means (weighted) | REF â HET | 3 | 0.14 | 
| K-means (weighted) | REF â ALT | 2 | 0.09 | 
| K-means (weighted) | HET â REF | 13 | 0.61 | 
| K-means (weighted) | HET â ALT | 6 | 0.28 | 
| K-means (weighted) | ALT â REF | 2 | 0.09 | 
| K-means (log10) | REF â HET | 403 | 18.90 | 
| K-means (log10) | REF â ALT | 2 | 0.09 | 
| K-means (log10) | HET â REF | 143 | 6.71 | 
| K-means (log10) | HET â ALT | 1 | 0.05 | 
| K-means (log10) | ALT â REF | 1 | 0.05 | 
| K-means (log10) | ALT â HET | 3 | 0.14 | 
| K-means (arcsin) | REF â HET | 2 | 0.09 | 
| K-means (arcsin) | REF â ALT | 1 | 0.05 | 
| K-means (arcsin) | HET â REF | 15 | 0.70 | 
| K-means (arcsin) | HET â ALT | 5 | 0.23 | 
| K-means (arcsin) | ALT â REF | 2 | 0.09 | 
| K-means (arcsin) | ALT â HET | 1 | 0.05 | 
| Gaussian Mixture | REF â ALT | 2 | 0.09 | 
| Gaussian Mixture | HET â REF | 28 | 1.31 | 
| Gaussian Mixture | HET â ALT | 2 | 0.09 | 
| Gaussian Mixture | ALT â REF | 3 | 0.14 | 
| Gaussian Mixture | ALT â HET | 3 | 0.14 | 
# Visualize HMM effects
ggplot(hmm_effects, aes(x = reorder(method, changes), y = changes)) +
  geom_col(fill = "steelblue", alpha = 0.7) +
  geom_text(aes(label = paste0(round(changes, 1), "%")), 
            hjust = -0.1, size = 4) +
  labs(
    title = "Effect of HMM Smoothing on Different Clustering Methods",
    x = "Clustering Method",
    y = "Percentage of Bins Changed (%)"
  ) +
  theme_minimal() +
  coord_flip()# Analyze where methods disagree most
disagreement_data <- read_freq %>%
  mutate(
    POS = BIN_POS * BIN_SIZE,
    # Check if all original methods agree
    original_agreement = factor(ifelse(
      K == Klog & K == Kasin & K == Kgmm,
      "All methods agree",
      "Methods disagree"
    ), levels = c("All methods agree", "Methods disagree")),
    # Check if all HMM methods agree
    hmm_agreement = factor(ifelse(
      K_HMM == Klog_HMM & K_HMM == Kasin_HMM & K_HMM == Kgmm_HMM,
      "All methods agree",
      "Methods disagree"
    ), levels = c("All methods agree", "Methods disagree"))
  )
# Calculate disagreement statistics
original_disagreement <- mean(disagreement_data$original_agreement == "Methods disagree") * 100
hmm_disagreement <- mean(disagreement_data$hmm_agreement == "Methods disagree") * 100
cat("Original methods disagreement:", round(original_disagreement, 1), "%\n")## Original methods disagreement: 64.1 %## HMM smoothed methods disagreement: 75.9 %cat("Reduction in disagreement:", round(original_disagreement - hmm_disagreement, 1), "percentage points\n")## Reduction in disagreement: -11.8 percentage points# Create tile plot showing disagreement regions
p1 <- ggplot(disagreement_data, aes(x = POS, y = 1, fill = original_agreement)) +
  geom_tile() +
  scale_fill_manual(values = c("darkgreen", "darkred")) +
  facet_grid(. ~ CONTIG, scales = "free_x") +
  scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  labs(title = "Original Methods Agreement/Disagreement", fill = NULL) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    strip.background = element_blank(),
    axis.line.y = element_blank(),
    axis.text.y = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    panel.spacing.x = unit(0.0, "lines")
  )
p2 <- ggplot(disagreement_data, aes(x = POS, y = 1, fill = hmm_agreement)) +
  geom_tile() +
  scale_fill_manual(values = c("darkgreen", "darkred")) +
  facet_grid(. ~ CONTIG, scales = "free_x") +
  scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  labs(title = "HMM Smoothed Methods Agreement/Disagreement", 
       x = "Genome Position (Mb)", fill = NULL) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    strip.background = element_blank(),
    axis.line.y = element_blank(),
    axis.text.y = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.spacing.x = unit(0.0, "lines")
  )
ggpubr::ggarrange(p1, p2, ncol = 1, common.legend = TRUE)Based on this comprehensive analysis of ancestry calling methods with HMM smoothing:
Method Performance: All clustering methods show reasonable performance, with the Gaussian Mixture Model and arcsin-transformed K-means showing slightly better separation of genotype classes.
HMM Smoothing Benefits:
Genotype Frequency Alignment: HMM smoothing brings observed frequencies closer to theoretical BC2S3 expectations, particularly for ALT regions.
Kgmm_HMM) for final
ancestry calls, as it:
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-apple-darwin20
## Running under: macOS 15.3.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] tibble_3.2.1        ggpubr_0.6.0        scales_1.4.0       
## [4] tidyr_1.3.1         HMM_1.0.1           rebmix_2.16.0      
## [7] Ckmeans.1d.dp_4.3.5 ggplot2_3.5.2       dplyr_1.1.4        
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        generics_0.1.4     rstatix_0.7.2      digest_0.6.37     
##  [5] magrittr_2.0.3     evaluate_1.0.3     grid_4.4.1         RColorBrewer_1.1-3
##  [9] fastmap_1.2.0      jsonlite_2.0.0     backports_1.5.0    Formula_1.2-5     
## [13] gridExtra_2.3      purrr_1.0.4        jquerylib_0.1.4    abind_1.4-8       
## [17] Rdpack_2.6.4       cli_3.6.5          rlang_1.1.6        rbibutils_2.3     
## [21] cowplot_1.1.3      withr_3.0.2        cachem_1.1.0       yaml_2.3.10       
## [25] tools_4.4.1        ggsignif_0.6.4     broom_1.0.8        vctrs_0.6.5       
## [29] R6_2.6.1           lifecycle_1.0.4    car_3.1-3          pkgconfig_2.0.3   
## [33] pillar_1.10.2      bslib_0.9.0        gtable_0.3.6       glue_1.8.0        
## [37] Rcpp_1.0.14        xfun_0.52          tidyselect_1.2.1   rstudioapi_0.17.1 
## [41] knitr_1.50         dichromat_2.0-0.1  farver_2.1.2       htmltools_0.5.8.1 
## [45] labeling_0.4.3     rmarkdown_2.29     carData_3.0-5      compiler_4.4.1