This document outlines a genomic ancestry analysis pipeline for maize introgression lines. The analysis identifies genomic ancestry across bins of size 1Mb using the following approach:
# Load required libraries
library(dplyr)        # Data manipulation
library(ggplot2)      # Plotting
library(Ckmeans.1d.dp) # K-means clustering
library(rebmix)       # Gaussian mixture model
library(HMM)          # Hidden Markov Models
library(tidyr)        # Data reshaping
library(scales)       # Unit formatting for plots
library(ggpubr)       # Plot arrangement
library(tibble)       # Modern data frames# Read the binned allelic counts
read_freq <- read.table("~/Desktop/PN17_SID1632_bin_genotypes.tsv", header=TRUE)
# Display first few rows of the binned data
head(read_freq)##   CONTIG BIN_POS VARIANT_COUNT INFORMATIVE_VARIANT_COUNT DEPTH_SUM ALT_COUNT
## 1   chr1       1          8163                      3810      7355        34
## 2   chr1       2         13158                      5512     10174        20
## 3   chr1       3         14272                      6128     11390        25
## 4   chr1       4         11263                      5084      9273        16
## 5   chr1       5         12470                      5440     10589        38
## 6   chr1       6         10971                      4497      8514        21
##      ALT_FREQ BIN_START BIN_END ALT_FREQ_NORM GENOTYPE
## 1 0.004622706      5892  999990   0.002914504      REF
## 2 0.001965795   1000069 1999892   0.001997776      REF
## 3 0.002194908   2000252 2999995   0.002419468      REF
## 4 0.001725439   3000032 3989703   0.001500972      REF
## 5 0.003588630   4002810 4999974   0.003456319      REF
## 6 0.002466526   5000001 5997031   0.002090021      REF# Calculate summary statistics
IVC_mean <- mean(read_freq$INFORMATIVE_VARIANT_COUNT)
IVC_sd <- sd(read_freq$INFORMATIVE_VARIANT_COUNT)
# Display summary of bins
cat("Total number of bins:", nrow(read_freq), "\n")## Total number of bins: 2132## Average depth sum: 9686.165## Average number of informative variants per bin: 5223.613cat("Median number of informative variants per bin:", median(read_freq$INFORMATIVE_VARIANT_COUNT), "\n")## Median number of informative variants per bin: 5093.5cat("Range of informative variants per bin:", min(read_freq$INFORMATIVE_VARIANT_COUNT), "to", max(read_freq$INFORMATIVE_VARIANT_COUNT), "\n")## Range of informative variants per bin: 13 to 10052# Create weightesALT_FREQ 
read_freq <- read_freq %>%
  mutate(
    ALT_FREQ_WEIGHTED = ALT_FREQ * (INFORMATIVE_VARIANT_COUNT / IVC_mean)
  )
# Visualize the relationship between informative variant count and ALT_FREQ
ggplot(read_freq, aes(x = INFORMATIVE_VARIANT_COUNT, y = ALT_FREQ)) +
  geom_point(alpha = 0.6) +
  labs(title = paste("Relationship between informative variant count and ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""),
       x = "Number of informative variants in bin", 
       y = "Alternative allele frequency") +
  theme_minimal()# Compare original vs weighted frequencies
p1 <- ggplot(read_freq, aes(x = ALT_FREQ)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = "Original ALT_FREQ distribution", x = "ALT_FREQ") +
  theme_minimal()
p2 <- ggplot(read_freq, aes(x = ALT_FREQ_WEIGHTED)) +
  geom_histogram(bins = 30, fill = "tomato", color = "white") +
  labs(title = "Weighted ALT_FREQ distribution", x = "ALT_FREQ_WEIGHTED") +
  theme_minimal()
ggpubr::ggarrange(p1, p2, ncol = 2)# Check how many bins have ALT_FREQ = 0
zero_alt_bins <- sum(read_freq$ALT_FREQ == 0)
total_bins <- nrow(read_freq)
cat("Bins with ALT_FREQ_WEIGHTED = 0:", 
    zero_alt_bins, "out of", total_bins, 
    "(", round(100 * zero_alt_bins / total_bins, 1), "%)\n")## Bins with ALT_FREQ_WEIGHTED = 0: 5 out of 2132 ( 0.2 %)# Create a separate dataset for non-zero bins (for clustering)
non_zero_bins <- read_freq %>%
  filter(ALT_FREQ_WEIGHTED > 0)
cat("Bins with ALT_FREQ_WEIGHTED > 0 (to be clustered):", 
    nrow(non_zero_bins), "out of", total_bins,
    "(", round(100 * nrow(non_zero_bins) / total_bins, 1), "%)\n")## Bins with ALT_FREQ_WEIGHTED > 0 (to be clustered): 2127 out of 2132 ( 99.8 %)# Create transformations for better clustering performance
non_zero_bins$asin <- asin(sqrt(non_zero_bins$ALT_FREQ))
# Visualize distributions of the weighted and transformed data
p1 <- ggplot(non_zero_bins, aes(x = ALT_FREQ)) + 
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = paste("Weighted ALT_FREQ distribution (non-zero bins, ", BIN_SIZE_LABEL, ")", sep=""), 
       x = "ALT_FREQ_WEIGHTED") +
  theme_minimal()
p2 <- ggplot(non_zero_bins, aes(x = asin)) + 
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = "Arcsin transformation (non-zero bins)",
       x = "asin(sqrt(ALT_FREQ_WEIGHTED))") +
  theme_minimal()
ggpubr::ggarrange(p1, p2, common.legend = TRUE)K <- 3
# Apply K-means clustering with k=3 to each transformation for non-zero bins
non_zero_bins$K <- as.factor(Ckmeans.1d.dp(non_zero_bins$ALT_FREQ, K)$cluster)
non_zero_bins$Kasin <- as.factor(Ckmeans.1d.dp(non_zero_bins$asin, K)$cluster)
# Function to relabel clusters as REF, HET, ALT based on ALT_FREQ
relabel_clusters <- function(clusters, data) {
  # Calculate mean ALT_FREQ for each cluster
  cluster_means <- tapply(data$ALT_FREQ, clusters, mean)
  
  # Order clusters by mean ALT_FREQ
  ordered_clusters <- order(cluster_means)
  
  # Create mapping from original cluster to REF, HET, ALT
  cluster_map <- rep(NA, length(unique(as.numeric(clusters))))
  cluster_map[ordered_clusters[1]] <- "REF"
  cluster_map[ordered_clusters[2]] <- "HET"
  cluster_map[ordered_clusters[3]] <- "ALT"
  
  # Apply mapping to original clusters
  return(factor(cluster_map[as.numeric(clusters)], levels = c("REF", "HET", "ALT")))
}
# Apply relabeling to each clustering method
non_zero_bins$K <- relabel_clusters(non_zero_bins$K, non_zero_bins)
non_zero_bins$Kasin <- relabel_clusters(non_zero_bins$Kasin, non_zero_bins)
# Check cluster proportions for each method
cat("K-means cluster proportions (non-zero bins):\n")## K-means cluster proportions (non-zero bins):## 
##   REF   HET   ALT 
## 0.973 0.015 0.012## 
## Arcsin-transformed K-means cluster proportions (non-zero bins):## 
##   REF   HET   ALT 
## 0.967 0.019 0.014# Fit Gaussian mixture model with 3 components
normalest <- REBMIX(
  Dataset = list(data.frame(Value = non_zero_bins$ALT_FREQ)),
  Preprocessing = "histogram",
  cmin = 3,  # Force 3 components
  cmax = 3,
  Criterion = "BIC",
  pdf = "normal"
)
# Assign clusters
normclu <- RCLRMIX(x = normalest)
non_zero_bins$Kgmm <- as.factor(normclu@Zp)
# Relabel Kgmm clusters
non_zero_bins$Kgmm <- relabel_clusters(non_zero_bins$Kgmm, non_zero_bins)
# Check Kgmm cluster proportions
cat("Gaussian Mixture Model cluster proportions (non-zero bins):\n")## Gaussian Mixture Model cluster proportions (non-zero bins):## 
##   REF   HET   ALT 
## 0.956 0.017 0.026# Create new columns in read_freq for each clustering method
read_freq$K <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Kasin <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Kgmm <- factor(NA, levels = c("REF", "HET", "ALT"))
# Assign the non-zero bin clustering results to the main dataset
for (method in c("K","Kasin", "Kgmm")) {
  # Create an index for non-zero bins in the original data
  non_zero_idx <- which(read_freq$ALT_FREQ_WEIGHTED > 0)
  
  # Assign cluster classifications to non-zero bins
  read_freq[[method]][non_zero_idx] <- non_zero_bins[[method]]
}
# Force bins with ALT_FREQ = 0 to be "REF"
zero_idx <- which(read_freq$ALT_FREQ == 0)
for (method in c("K", "Kasin", "Kgmm")) {
  read_freq[[method]][zero_idx] <- "REF"
}
# Verify the merging worked correctly
cat("Final cluster proportions for raw K-means:\n")## Final cluster proportions for raw K-means:## 
##   REF   HET   ALT 
## 0.973 0.015 0.012To compare the different clustering methods, we visualize the data in several ways:
# Define color palette
pal <- c("REF" = "gold", "HET" = "springgreen4", "ALT" = "purple4")
# Create a function for scatter plots
create_scatter <- function(data, method_col, title) {
  data %>%
    mutate(POS = BIN_POS * BIN_SIZE) %>%
    ggplot(aes(x = POS, y = ALT_FREQ, col = .data[[method_col]])) +
        labs(title = title, x = "Position (Mb)", y = "ALT_FREQ", color = "Genotype") +
    geom_point(size = 1, alpha = 0.7) +
    scale_color_manual(values = pal) +
    facet_wrap(.~CONTIG,ncol=10, scales = "free_x", strip.position = "bottom") +
    theme_bw() +
    scale_x_continuous(labels = unit_format(unit = "", scale = 1e-6))
}
read_freq$CONTIG <-factor(read_freq$CONTIG, levels= paste0("chr",1:10))
  
# Create scatter plots for each method
p1 <- create_scatter(read_freq, "K", paste("K-means on  ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""))
p2 <- create_scatter(read_freq, "Kasin", paste("K-means on arcsin sqrt ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""))
p3 <- create_scatter(read_freq, "Kgmm", paste("Gaussian mixture model on ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""))
# Display all scatter plots
ggpubr::ggarrange(p1, p2, p3, ncol = 1, common.legend = TRUE)For maize introgression lines produced through backcrossing followed by selfing (e.g., BC2S3), we can calculate expected genotype frequencies based on Mendelian genetics:
# Create mating matrices 
create_mating_matrices <- function() {
  # Backcross matrix for crossing population with AA donor
  # Each COLUMN represents offspring distribution from that parent × AA
  # Column 1: AA × AA → [1, 0, 0] (100% AA offspring)
  # Column 2: Aa × AA → [1/2, 1/2, 0] (50% AA, 50% Aa offspring)  
  # Column 3: aa × AA → [0, 1, 0] (100% Aa offspring)
  backcross_AA <- matrix(c(
    1, 1/2, 0,    
    0, 1/2, 1,    
    0,   0, 0     
  ), nrow = 3, byrow = TRUE)
  
  # Backcross matrix for crossing population with aa donor
  # Column 1: AA × aa → [0, 1, 0] (100% Aa offspring)
  # Column 2: Aa × aa → [0, 1/2, 1/2] (50% Aa, 50% aa offspring)
  # Column 3: aa × aa → [0, 0, 1] (100% aa offspring)
  backcross_aa <- matrix(c(
    0,   0, 0,    
    1, 1/2, 0,    
    0, 1/2, 1     
  ), nrow = 3, byrow = TRUE)
  
  # Selfing matrix - each individual self-fertilizes
  # Column 1: AA selfed → [1, 0, 0] (100% AA offspring)
  # Column 2: Aa selfed → [1/4, 1/2, 1/4] (25% AA, 50% Aa, 25% aa offspring)
  # Column 3: aa selfed → [0, 0, 1] (100% aa offspring)
  selfing <- matrix(c(
    1, 1/4, 0,    
    0, 1/2, 0,    
    0, 1/4, 1     
  ), nrow = 3, byrow = TRUE)
  
  return(list(
    backcross_AA = backcross_AA,
    backcross_aa = backcross_aa,
    selfing = selfing
  ))
}
# Function to calculate genotype frequencies after breeding scheme
calculate_nil_frequencies <- function(
    bc=2, s=3, 
    donor  = "aa",                      # the recurrent parent is AA
    crossing_pop = c(AA=0, Aa=1,aa=0),  # By default the F1 progeny of AA x aa
                                        # 100 % hetereozygous Aa
    debug_nil = FALSE 
    ) {
  # Input validation
  if (bc  < 0 || s < 0) {
    stop("Number of backcrosses and self generations must be non-negative")
  }
  if (!donor %in% c("AA", "aa")) {
    stop("Donor type must be 'AA' or 'aa'")
  }
  
  # Get mating matrices
  matrices <- create_mating_matrices()
  
  # Initial F1 generation - all heterozygous after crossing pure lines
  # Population vector: [AA frequency, Aa frequency, aa frequency]
  current_population <- matrix(crossing_pop, ncol = 1)
  
  # Apply backcrosses sequentially
  if (bc > 0) {
    # Select appropriate backcross matrix based on donor type
    backcross_matrix <- if (donor == "aa") {
      matrices$backcross_AA
    } else {
      matrices$backcross_aa
    }
    
    # Each backcross: Matrix %*% population_vector
    # This takes weighted average of matrix columns based on current population
    for (i in 1:bc) {
      current_population <- backcross_matrix %*% current_population
      
      # Optional: print intermediate results for debugging
      if (debug_nil) {
        cat("Genotype frequencies : [AA, Aa, aa]\n") 
        cat("After backcross", i, ": [", 
            round(current_population[1,1], 4), ", ",
            round(current_population[2,1], 4), ", ", 
            round(current_population[3,1], 4), "]\n")
      }
    }
  }
  
  # Apply selfing generations sequentially  
  if (s > 0) {
    # Each selfing: selfing_matrix %*% population_vector
    # Each column represents offspring from selfing that genotype
    for (i in 1:s) {
      current_population <- matrices$selfing %*% current_population
      
      # Optional: print intermediate results for debugging
      if (!is.null(debug_nil) & debug_nil) {
        cat("After self", i, ": [", 
            round(current_population[1,1], 4), ", ",
            round(current_population[2,1], 4), ", ", 
            round(current_population[3,1], 4), "]\n")
      }
    }
  }
  
  # Extract final frequencies as named vector
  final_frequencies <- as.vector(current_population)
  names(final_frequencies) <- c("AA", "Aa", "aa")
  
  return(final_frequencies)
}
# Function to return frequencies in HMM format (REF, HET, ALT)
nil_frequencies_for_hmm <- function(
    bc=2, s=3, 
    donor = "aa",        # the recurrent parent is AA
    crossing_pop = c(AA=0,Aa=1,aa=0),  # By default the F1 progeny of AA x aa, 
                                      # 100 % hetereozygous Aa
    ...
    ) {
  # Calculate raw genotype frequencies
  genotype_freqs <- calculate_nil_frequencies(bc, s, donor, crossing_pop,...)
  
  # Convert to HMM format based on which allele is being introgressed
  if (donor == "AA") {
    # A is the introgressed (donor) allele
    # aa represents the recurrent parent background (REF)
    # AA represents regions with donor allele (ALT)
    hmm_frequencies <- c(
      REF = genotype_freqs["aa"],  # Recurrent parent background
      HET = genotype_freqs["Aa"],  # Heterozygous regions  
      ALT = genotype_freqs["AA"]   # Donor introgression
    )
  } else {
    # a is the introgressed (donor) allele
    # AA represents the recurrent parent background (REF)
    # aa represents regions with donor allele (ALT)
    hmm_frequencies <- c(
      REF = genotype_freqs["AA"],  # Recurrent parent background
      HET = genotype_freqs["Aa"],  # Heterozygous regions
      ALT = genotype_freqs["aa"]   # Donor introgression  
    )
  }
  names(hmm_frequencies) <- c("REF", "HET","ALT")
  return(hmm_frequencies)
}
bc2s3 <- nil_frequencies_for_hmm(bc=2, s=3, donor = "aa", debug_nil= TRUE)## Genotype frequencies : [AA, Aa, aa]
## After backcross 1 : [ 0.5 ,  0.5 ,  0 ]
## Genotype frequencies : [AA, Aa, aa]
## After backcross 2 : [ 0.75 ,  0.25 ,  0 ]
## After self 1 : [ 0.8125 ,  0.125 ,  0.0625 ]
## After self 2 : [ 0.8438 ,  0.0625 ,  0.0938 ]
## After self 3 : [ 0.8594 ,  0.0312 ,  0.1094 ]## Expected genotype frequencies for BC2S3:##  REF  HET  ALT 
## 0.86 0.03 0.11# Function to apply HMM smoothing to ancestry calls
smooth_ancestry_with_hmm <- function(
    genotypes, 
    transitions = c(0.99, 0.01) # 1Mb ~ 1cM, you could add genetic map to this.
    ) {
  # Convert genotypes to numeric (0=REF, 1=HET, 2=ALT)
  geno_numeric <- as.numeric(factor(genotypes, levels=c("REF", "HET", "ALT"))) - 1
  
  # Set up transition probabilities (high probability of staying in same state)
  trans_prob <- matrix(c(
    transitions[1], transitions[2]/2, transitions[2]/2,  # From REF
    transitions[2]/2, transitions[1], transitions[2]/2,  # From HET
    transitions[2]/2, transitions[2]/2, transitions[1]   # From ALT
  ), nrow=3, byrow=TRUE)
  
  # Set up emission probabilities (how likely each state produces observed genotypes)
  emiss_prob <- matrix(c(
    0.9, 0.08, 0.02,  # REF state
    0.1, 0.8, 0.1,    # HET state
    0.02, 0.08, 0.9   # ALT state
  ), nrow=3, byrow=TRUE)
  
  # Initialize HMM with BC2S3 priors
  hmm <- initHMM(c("REF", "HET", "ALT"), c("0", "1", "2"), 
                startProbs = bc2s3,  # Prior probabilities from breeding design
                transProbs = trans_prob, 
                emissionProbs = emiss_prob)
  
  # Run Viterbi algorithm to find most likely state sequence
  viterbi_path <- viterbi(hmm, as.character(geno_numeric))
  
  # Convert back to genotype calls
  smoothed_genotypes <- factor(viterbi_path, levels=c("REF", "HET", "ALT"))
  
  return(smoothed_genotypes)
}# Apply HMM smoothing to each chromosome separately
read_freq <- read_freq %>%
  group_by(CONTIG) %>%
  mutate(
    K_HMM = smooth_ancestry_with_hmm(K, transitions=c(0.995, 0.005)),
    Kasin_HMM = smooth_ancestry_with_hmm(Kasin, transitions=c(0.995, 0.005)),
    Kgmm_HMM = smooth_ancestry_with_hmm(Kgmm, transitions=c(0.995, 0.005))
  ) %>%
  ungroup()
# Ensure chromosome factor ordering
read_freq$CONTIG <- factor(read_freq$CONTIG, levels = paste0("chr", 1:10))# Create tile plot function
create_tile <- function(data, method_col, title) {
  data %>%
    mutate(
      CONTIG = factor(CONTIG, levels = paste0("chr", 1:10)),
      POS = BIN_POS * BIN_SIZE
    ) %>%
    ggplot(aes(x = POS, y = 1, fill = .data[[method_col]])) +
    geom_tile() +
    facet_wrap( .~CONTIG, ncol = 1, strip.position = "left") +
    scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
    scale_fill_manual(values = pal) +
    labs(title = title, x = "Position (bp)", fill = "Genotype") +
    ggpubr::theme_classic2(base_size = 10) +
    theme(
      legend.position = "bottom",
      strip.background = element_blank(),
      strip.text.y.left = element_text(angle = 0, hjust = 0),
      panel.spacing.y = unit(0.0, "lines"),
      axis.line.y = element_blank(),
      axis.text.y = element_blank(),
      axis.title.y = element_blank(),
      axis.ticks.y = element_blank()
    ) 
}# Compare before and after HMM smoothing
create_tile(read_freq, "Kgmm", paste("Before HMM Smoothing |", BIN_SIZE_LABEL, "bins")) # Create a combined plot for all methods including HMM
read_freq %>%
  pivot_longer(cols = starts_with("K"), names_to = "method", values_to = "genotype") %>%
  mutate(
    POS = BIN_POS * BIN_SIZE,
    method = factor(method, 
                   levels = c("K", "K_HMM", "Kasin", "Kasin_HMM", "Kgmm", "Kgmm_HMM"),
                   labels = c("K-means", "K-means\nHMM",
                              "K-means (arcsin)", "K-means (arcsin)\nHMM",
                              "Gaussian Mixture", "Gaussian Mixture\nHMM"))
  ) %>%
  ggplot(aes(x = POS, y = 1, fill = genotype)) +
  geom_tile() +
  scale_fill_manual(values = pal) +
  facet_grid(method ~ CONTIG, scales = "free") +
  scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  ggpubr::theme_classic2() +
  theme(
    legend.position = "bottom",
    strip.background = element_blank(),
    strip.text.x = element_text(hjust = 0),
    strip.text.y = element_text(angle = 0, face = "bold"),
    panel.spacing.x = unit(0.0, "lines"),
    panel.spacing.y = unit(0.0, "lines"),
    axis.line.y = element_blank(),
    axis.text.y = element_blank(),
    axis.line.x = element_blank(),
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  labs(
    title = paste("Comparison of Clustering Methods with HMM Smoothing (", BIN_SIZE_LABEL, " bins)", sep=""),
    x = "Genome Position (Mb)",
    fill = "Genotype"
  )# Create a summary of genotype frequencies for all methods
genotype_freq <- read_freq %>%
  pivot_longer(cols = c(K, Kasin, Kgmm, K_HMM, Kasin_HMM, Kgmm_HMM), 
               names_to = "method", 
               values_to = "genotype") %>%
  group_by(method, genotype) %>%
  summarize(
    count = n(),
    percentage = n() / nrow(read_freq) * 100,
    .groups = "drop"
  ) %>%
  mutate(
    method_type = ifelse(grepl("_HMM", method), "HMM Smoothed", "Original"),
    method_base = gsub("_HMM", "", method),
    method = case_when(
      method == "K" ~ "K-means",
      method == "K_HMM" ~ "K-means HMM",
      method == "Kasin" ~ "K-means (arcsin)",
      method == "Kasin_HMM" ~ "K-means (arcsin) HMM",
      method == "Kgmm" ~ "Gaussian Mixture",
      method == "Kgmm_HMM" ~ "Gaussian Mixture HMM",
      TRUE ~ method
    )
  )
# Add expected frequencies
expectation <- data.frame(
  method = "Expected (BC2S3)",
  genotype = c("REF", "HET", "ALT"),
  count = round(bc2s3 * nrow(read_freq), 0),
  percentage = 100 * bc2s3,
  method_type = "Theoretical",
  method_base = "Expected"
)
# Combine and create frequency table
method_freq <- rbind(expectation, genotype_freq) %>%
  select(method, genotype, percentage) %>%
  pivot_wider(
    id_cols = genotype,
    names_from = method,
    values_from = percentage
  )
# Display frequency table
method_freq %>%
  knitr::kable(
    caption = paste("Genotype frequencies (%) across methods (", BIN_SIZE_LABEL, " bins)", sep=""),
    digits = 1
  )| genotype | Expected (BC2S3) | K-means | K-means HMM | K-means (arcsin) | K-means (arcsin) HMM | Gaussian Mixture | Gaussian Mixture HMM | 
|---|---|---|---|---|---|---|---|
| REF | 85.9 | 97.3 | 97.9 | 96.7 | 97.4 | 95.6 | 97.0 | 
| HET | 3.1 | 1.5 | 0.6 | 1.9 | 1.1 | 1.7 | 0.5 | 
| ALT | 10.9 | 1.2 | 1.5 | 1.4 | 1.5 | 2.6 | 2.5 | 
# Visualize frequencies
all_freq <- rbind(expectation, genotype_freq)
all_freq %>%
  ggplot(aes(x = method, y = percentage, fill = genotype)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_manual(values = pal) +
  labs(
    title = paste("Genotype frequencies across methods (", BIN_SIZE_LABEL, " bins)", sep=""),
    x = "Method",
    y = "Percentage (%)",
    fill = "Genotype"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top"
  ) # Calculate agreement between methods including HMM versions
methods <- c("K", "Kasin", "Kgmm", "K_HMM", "Kasin_HMM", "Kgmm_HMM")
agreement_matrix <- matrix(NA, length(methods), length(methods))
colnames(agreement_matrix) <- methods
rownames(agreement_matrix) <- methods
for (i in 1:length(methods)) {
  for (j in 1:length(methods)) {
    agreement_matrix[i, j] <- mean(read_freq[[methods[i]]] == read_freq[[methods[j]]]) * 100
  }
}
# Display agreement matrix
knitr::kable(agreement_matrix, caption = "Percentage agreement between methods", digits = 1)| K | Kasin | Kgmm | K_HMM | Kasin_HMM | Kgmm_HMM | |
|---|---|---|---|---|---|---|
| K | 100.0 | 99.2 | 96.9 | 98.8 | 98.8 | 98.2 | 
| Kasin | 99.2 | 100.0 | 97.7 | 98.5 | 98.9 | 98.4 | 
| Kgmm | 96.9 | 97.7 | 100.0 | 97.0 | 97.3 | 98.2 | 
| K_HMM | 98.8 | 98.5 | 97.0 | 100.0 | 99.5 | 98.5 | 
| Kasin_HMM | 98.8 | 98.9 | 97.3 | 99.5 | 100.0 | 98.9 | 
| Kgmm_HMM | 98.2 | 98.4 | 98.2 | 98.5 | 98.9 | 100.0 | 
# Visualize agreement matrix
agreement_df <- as.data.frame(agreement_matrix) %>%
  tibble::rownames_to_column("Method1") %>%
  pivot_longer(-Method1, names_to = "Method2", values_to = "Agreement")
ggplot(agreement_df, aes(x = Method1, y = Method2, fill = Agreement)) +
  geom_tile() +
  geom_text(aes(label = round(Agreement, 1)), color = "white", size = 3) +
  scale_fill_gradient(low = "steelblue", high = "darkred") +
  labs(
    title = "Agreement Between Methods (%)",
    x = NULL,
    y = NULL,
    fill = "Agreement (%)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(face = "bold"),
    panel.grid = element_blank()
  )# Calculate how much each method changed after HMM smoothing
hmm_effects <- data.frame(
  method = c("K", "Kasin", "Kgmm"),
  changes = c(
    mean(read_freq$K != read_freq$K_HMM) * 100,
    mean(read_freq$Kasin != read_freq$Kasin_HMM) * 100,
    mean(read_freq$Kgmm != read_freq$Kgmm_HMM) * 100
  )
)
# Display HMM effects
cat("Percentage of bins changed by HMM smoothing:\n")## Percentage of bins changed by HMM smoothing:hmm_effects %>%
  mutate(method = case_when(
    method == "K" ~ "K-means",
    method == "Kasin" ~ "K-means (arcsin)",
    method == "Kgmm" ~ "Gaussian Mixture",
    TRUE ~ method
  )) %>%
  knitr::kable(col.names = c("Method", "% Bins Changed"), digits = 2)| Method | % Bins Changed | 
|---|---|
| K-means | 1.22 | 
| K-means (arcsin) | 1.08 | 
| Gaussian Mixture | 1.78 | 
# Visualize HMM effects
ggplot(hmm_effects, aes(x = reorder(method, changes), y = changes)) +
  geom_col(fill = "steelblue", alpha = 0.7) +
  geom_text(aes(label = paste0(round(changes, 1), "%")), 
            hjust = -0.1, size = 4) +
  labs(
    title = "Effect of HMM Smoothing on Different Clustering Methods",
    x = "Clustering Method",
    y = "Percentage of Bins Changed (%)"
  ) +
  theme_minimal() +
  coord_flip()# Analyze where methods disagree most
disagreement_data <- read_freq %>%
  mutate(
    POS = BIN_POS * BIN_SIZE,
    # Check if all original methods agree
    original_agreement = factor(ifelse(
      K == Kasin & K == Kgmm,
      "All methods agree",
      "Methods disagree"
    ), levels = c("All methods agree", "Methods disagree")),
    # Check if all HMM methods agree
    hmm_agreement = factor(ifelse(
      K_HMM == Kasin_HMM & K_HMM == Kgmm_HMM,
      "All methods agree",
      "Methods disagree"
    ), levels = c("All methods agree", "Methods disagree"))
  )
# Calculate disagreement statistics
original_disagreement <- mean(disagreement_data$original_agreement == "Methods disagree") * 100
hmm_disagreement <- mean(disagreement_data$hmm_agreement == "Methods disagree") * 100
cat("Original methods disagreement:", round(original_disagreement, 1), "%\n")## Original methods disagreement: 3.1 %## HMM smoothed methods disagreement: 1.5 %cat("Reduction in disagreement:", round(original_disagreement - hmm_disagreement, 1), "percentage points\n")## Reduction in disagreement: 1.6 percentage points# Create tile plot showing disagreement regions
p1 <- ggplot(disagreement_data, aes(x = POS, y = 1, fill = original_agreement)) +
  geom_tile() +
  scale_fill_manual(values = c("darkgreen", "darkred")) +
  facet_grid(. ~ CONTIG, scales = "free_x") +
  scale_x_continuous(labels = unit_format(unit = "", scale = 1e-6)) +
  labs(title = "Original Methods Agreement/Disagreement", fill = NULL) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    strip.background = element_blank(),
    axis.line.y = element_blank(),
    axis.text.y = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    panel.spacing.x = unit(0.0, "lines")
  )
p2 <- ggplot(disagreement_data, aes(x = POS, y = 1, fill = hmm_agreement)) +
  geom_tile() +
  scale_fill_manual(values = c("darkgreen", "darkred")) +
  facet_grid(. ~ CONTIG, scales = "free_x") +
  scale_x_continuous(labels = unit_format(unit = "", scale = 1e-6)) +
  labs(title = "HMM Smoothed Methods Agreement/Disagreement", 
       x = "Genome Position (Mb)", fill = NULL) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    strip.background = element_blank(),
    axis.line.y = element_blank(),
    axis.text.y = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.spacing.x = unit(0.0, "lines")
  )
ggpubr::ggarrange(p1, p2, ncol = 1, common.legend = TRUE)# Calculate actual state transitions from the HMM smoothed results
transition_summary <- data.frame()
for (method in c("K_HMM", "Kasin_HMM", "Kgmm_HMM")) {
  # Process each chromosome separately
  for (chr in paste0("chr",1:10)) {
    # Get states for this chromosome
    chr_states <- read_freq %>% 
      filter(CONTIG == chr) %>% 
      arrange(BIN_POS) %>% 
      pull(!!method)
    
    # Skip empty chromosomes
    if (length(chr_states) == 0) next
    
    # Use run-length encoding to find segments
    rle_result <- rle(as.character(chr_states))
    
    # Get the values (states) and lengths
    states <- rle_result$values
    lengths <- rle_result$lengths
    
    # Count transitions between states (excluding self-transitions)
    if (length(states) > 1) {
      for (i in 1:(length(states)-1)) {
        from_state <- states[i]
        to_state <- states[i+1]
        segment_length <- lengths[i]
        
        # Record this transition
        transition_summary <- rbind(transition_summary, data.frame(
          method = method,
          chromosome = chr,
          from = from_state,
          to = to_state,
          segment_length = segment_length,
          bins = sum(lengths) # total bins in chromosome
        ))
      }
    }
  }
}
# Calculate summary statistics on transitions
if (nrow(transition_summary) > 0) {
  transition_stats <- transition_summary %>%
    group_by(method, from, to) %>%
    summarize(
      count = n(),
      avg_segment_length = mean(segment_length),
      total_transitions = sum(count),
      .groups = "drop"
    ) %>%
    # Calculate transition rates 
    mutate(
      transition_rate = count / nrow(read_freq) ,
      method = case_when(
        method == "K_HMM" ~ "K-means HMM",
        method == "Kasin_HMM" ~ "K-means (arcsin) HMM",
        method == "Kgmm_HMM" ~ "Gaussian Mixture HMM",
        TRUE ~ method
      )
    )
  
  # Display transition statistics
  cat("Ancestry state transition rate:\n")
  transition_stats %>%
    select(method, from, to, count, transition_rate) %>%
    arrange(method, from, to) %>%
    knitr::kable(digits = 2)
  
  # Visualize transitions
  ggplot(transition_stats, aes(x = paste(from, "→", to), y = transition_rate, fill = method)) +
    geom_bar(stat = "identity", position = "dodge") +
    labs(
      title = "Ancestry State Transition rate per 1Mb bin",
      x = "Transition Type",
      y = "Transition Rate",
      fill = "Method"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
} else {
  cat("No transitions found in the data.\n")
}## Ancestry state transition rate:# Calculate segment length distribution
segment_summary <- data.frame()
for (method in c("K_HMM","Kasin_HMM", "Kgmm_HMM")) {
  # Process each chromosome separately
  for (chr in unique(read_freq$CONTIG)) {
    # Get states for this chromosome
    chr_states <- read_freq %>% 
      filter(CONTIG == chr) %>% 
      arrange(BIN_POS) %>% 
      pull(!!method)
    
    # Skip empty chromosomes
    if (length(chr_states) == 0) next
    
    # Use run-length encoding to find segments
    rle_result <- rle(as.character(chr_states))
    
    # Get the values (states) and lengths
    states <- rle_result$values
    lengths <- rle_result$lengths
    
    # Record all segments
    for (i in 1:length(states)) {
      segment_summary <- rbind(segment_summary, data.frame(
        method = method,
        chromosome = chr,
        state = states[i],
        length_bins = lengths[i],
        length_mb = lengths[i] * (BIN_SIZE / 1e6) # Convert bin count to Mb
      ))
    }
  }
}
# Analyze segment lengths
if (nrow(segment_summary) > 0) {
  segment_stats <- segment_summary %>%
    group_by(method, state) %>%
    summarize(
      count = n(),
      mean_length_mb = mean(length_mb),
      median_length_mb = median(length_mb),
      min_length_mb = min(length_mb),
      max_length_mb = max(length_mb),
      .groups = "drop"
    ) %>%
    mutate(
      method = case_when(
        method == "K_HMM" ~ "K-means HMM",
        method == "Kasin_HMM" ~ "K-means (arcsin) HMM",
        method == "Kgmm_HMM" ~ "Gaussian Mixture HMM",
        TRUE ~ method
      )
    )
  
  # Display segment statistics
  cat("\nAncestry segment length statistics (in Mb):\n")
  segment_stats %>%
    select(method, state, count, mean_length_mb, median_length_mb, max_length_mb) %>%
    arrange(method, state) %>%
    knitr::kable(digits = 2)
  
  # Visualize segment length distribution for donor introgression segments (ALT)
  alt_segments <- segment_summary %>%
    filter(state == "ALT") %>%
    mutate(
      method = case_when(
        method == "K_HMM" ~ "K-means HMM",
        method == "Kasin_HMM" ~ "K-means (arcsin) HMM",
        method == "Kgmm_HMM" ~ "Gaussian Mixture HMM",
        TRUE ~ method
      )
    )
  
  if (nrow(alt_segments) > 0) {
    ggplot(alt_segments, aes(x = length_mb, fill = method)) +
      geom_histogram(binwidth = 1, position = "dodge") +
      labs(
        title = "Distribution of Donor Introgression Segment Lengths",
        x = "Segment Length (Mb)",
        y = "Count",
        fill = "Method"
      ) +
      theme_minimal() +
      xlim(0, min(50, max(alt_segments$length_mb) + 5)) +
      facet_wrap(~method, scales = "free_y")
  }
}## 
## Ancestry segment length statistics (in Mb):## 
## Detailed transition patterns:transition_summary %>%
  mutate(
    transition = paste(from, "=>", to),
    method = case_when(
      method == "K" ~ "K-means",
      method == "Kasin" ~ "K-means (arcsin)",
      method == "Kgmm" ~ "Gaussian Mixture",
      TRUE ~ method
    )
  ) %>% select(method,transition) %>% table()##            transition
## method      ALT => HET ALT => REF HET => ALT HET => REF REF => ALT REF => HET
##   K_HMM              0          2          1          0          2          1
##   Kasin_HMM          1          1          1          1          2          1
##   Kgmm_HMM           1          3          0          1          4          0Based on this comprehensive analysis of ancestry calling methods with HMM smoothing:
Method Performance: All clustering methods show reasonable performance, with the Gaussian Mixture Model and arcsin-transformed K-means showing slightly better separation of genotype classes.
HMM Smoothing Benefits:
Genotype Frequency Alignment: HMM smoothing brings observed frequencies closer to theoretical BC2S3 expectations, particularly for ALT regions.
Kgmm_HMM) for final
ancestry calls, as it:
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-apple-darwin20
## Running under: macOS 15.3.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] tibble_3.2.1        ggpubr_0.6.0        scales_1.4.0       
## [4] tidyr_1.3.1         HMM_1.0.1           rebmix_2.16.0      
## [7] Ckmeans.1d.dp_4.3.5 ggplot2_3.5.2       dplyr_1.1.4        
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        generics_0.1.4     rstatix_0.7.2      digest_0.6.37     
##  [5] magrittr_2.0.3     evaluate_1.0.3     grid_4.4.1         RColorBrewer_1.1-3
##  [9] fastmap_1.2.0      jsonlite_2.0.0     backports_1.5.0    Formula_1.2-5     
## [13] tinytex_0.57       gridExtra_2.3      purrr_1.0.4        jquerylib_0.1.4   
## [17] pdftools_3.5.0     abind_1.4-8        Rdpack_2.6.4       cli_3.6.5         
## [21] rlang_1.1.6        rbibutils_2.3      cowplot_1.1.3      withr_3.0.2       
## [25] cachem_1.1.0       yaml_2.3.10        tools_4.4.1        ggsignif_0.6.4    
## [29] broom_1.0.8        vctrs_0.6.5        R6_2.6.1           magick_2.8.6      
## [33] lifecycle_1.0.4    car_3.1-3          pkgconfig_2.0.3    pillar_1.10.2     
## [37] bslib_0.9.0        gtable_0.3.6       glue_1.8.0         Rcpp_1.0.14       
## [41] xfun_0.52          tidyselect_1.2.1   rstudioapi_0.17.1  knitr_1.50        
## [45] dichromat_2.0-0.1  farver_2.1.2       htmltools_0.5.8.1  labeling_0.4.3    
## [49] rmarkdown_2.29     carData_3.0-5      qpdf_1.3.5         compiler_4.4.1    
## [53] askpass_1.2.1