Maize Genome Ancestry Analysis with Adjustable Bin Size

Overview

This document outlines a genomic analysis performed on maize genetic data. The goal is to identify genomic ancestry across bins of size 1Mb using the following approach:

Bins with no alternative alleles (ALT_FREQ_NORM = 0) are automatically classified as “REF”
For the remaining bins (ALT_FREQ_NORM > 0), K-means clustering with k=3 is applied
The clusters are assigned “REF” (lowest ALT_FREQ_NORM), “HET” (intermediate ALT_FREQ_NORM), and “ALT” (highest ALT_FREQ_NORM)
We replace the “REF” assignment from step 3 with the automatic “REF” assignment from step 1 for bins with ALT_FREQ_NORM = 0

Four different clustering methods are compared: 1. K-means with normalized ALT_FREQ 2. K-means with arcsin transformation of normalized ALT_FREQ 3. K-means with log10 transformation of normalized ALT_FREQ 4. Gaussian mixture model using the REBMIX library

Load Required Packages

# Load required libraries
library(dplyr)        # Data manipulation
library(ggplot2)      # Plotting
library(Ckmeans.1d.dp) # K-means clustering
library(rebmix)       # Gaussian mixture model
library(tidyr)        # Data reshaping
library(scales)       # Unit formatting for plots
library(ggpubr)       # Plot arrangement
library(tibble)       # Modern data frames

Data Preparation

First, we read the binned allelic counts:

# Create bins of specified size and calculate summary statistics for each bin
read_freq <- read.table("~/DEsktop/PN17_SID1632_bin_genotypes.tsv",header=TRUE)
# Display first few rows of the binned data
head(read_freq)

##   CONTIG POS_BIN     n NCOUNT ALT_COUNT    ALT_FREQ BIN_START BIN_END
## 1   chr1       1 10062   8884        36 0.004052229      3830  999990
## 2   chr1       2 16194  12748        28 0.002196423   1000069 1999892
## 3   chr1       3 18715  13955        32 0.002293085   2000252 2999995
## 4   chr1       4 15972  12956        24 0.001852424   3000032 3989703
## 5   chr1       5 18368  14794        42 0.002838989   4002810 4999974
## 6   chr1       6 16359  13049        29 0.002222393   5000001 5997031
##   ALT_FREQ_NORM GENOTYPE
## 1   0.002547013      REF
## 2   0.002221892      REF
## 3   0.002680790      REF
## 4   0.001848215      REF
## 5   0.003257455      REF
## 6   0.002271069      REF

# Display summary of bins
cat("Total number of bins:", nrow(read_freq), "\n")

## Total number of bins: 2132

cat("Average number of variants per bin:", mean(read_freq$n), "\n")

## Average number of variants per bin: 16008.37

cat("Median number of variants per bin:", median(read_freq$n), "\n")

## Median number of variants per bin: 15789

cat("Range of variants per bin:", min(read_freq$n), "to", max(read_freq$n), "\n")

## Range of variants per bin: 17 to 27631

Normalization by Variant Count

We’ll normalize the alternative allele frequency by the number of variants in each bin:

# Calculate normalization statistics
mean_n <- mean(read_freq$n)
sd_n <- sd(read_freq$n)

# Create normalized ALT_FREQ (accounting for variant density)
read_freq <- read_freq %>%
  mutate(
    # Normalize by variant count
    ALT_FREQ_NORM = ALT_FREQ * (n / mean_n),
    # Cap extreme values (optional)
    ALT_FREQ_NORM = pmin(pmax(ALT_FREQ_NORM, 0), 1)
  )

# Show the relationship between variant count and ALT_FREQ
ggplot(read_freq, aes(x = n, y = ALT_FREQ)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "loess", se = TRUE) +
  labs(title = paste("Relationship between variant count and ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""),
       x = "Number of variants in bin", 
       y = "Alternative allele frequency") +
  theme_minimal()

# Compare original vs normalized frequencies
p1 <- ggplot(read_freq, aes(x = ALT_FREQ)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = "Original ALT_FREQ distribution", x = "ALT_FREQ") +
  theme_minimal()

p2 <- ggplot(read_freq, aes(x = ALT_FREQ_NORM)) +
  geom_histogram(bins = 30, fill = "tomato", color = "white") +
  labs(title = "Normalized ALT_FREQ_NORM distribution", x = "ALT_FREQ_NORM") +
  theme_minimal()

ggpubr::ggarrange(p1, p2, ncol = 2)

Identify Zero-ALT Bins

We’ll now identify bins with no alternative alleles:

# Check how many bins have ALT_FREQ_NORM = 0
zero_alt_bins <- sum(read_freq$ALT_FREQ_NORM == 0)
total_bins <- nrow(read_freq)

cat("Bins with ALT_FREQ_NORM = 0:", 
    zero_alt_bins, "out of", total_bins, 
    "(", round(100 * zero_alt_bins / total_bins, 1), "%)\n")

## Bins with ALT_FREQ_NORM = 0: 4 out of 2132 ( 0.2 %)

# Create a separate dataset for non-zero bins (for clustering)
non_zero_bins <- read_freq %>%
  filter(ALT_FREQ_NORM > 0)

cat("Bins with ALT_FREQ_NORM > 0 (to be clustered):", 
    nrow(non_zero_bins), "out of", total_bins,
    "(", round(100 * nrow(non_zero_bins) / total_bins, 1), "%)\n")

## Bins with ALT_FREQ_NORM > 0 (to be clustered): 2128 out of 2132 ( 99.8 %)

Data Transformation for Non-Zero Bins

We create transformed versions of the normalized ALT_FREQ data for non-zero bins:

# Calculate minimum possible normalized frequency (for log transformation)
min_read_freq_norm <-   1/(1/min(read_freq$ALT_FREQ_NORM[read_freq$ALT_FREQ_NORM>0])+1)


# Create transformations
non_zero_bins$log <- log10(non_zero_bins$ALT_FREQ_NORM + min_read_freq_norm) # Log transformation
non_zero_bins$asin <- asin(sqrt(non_zero_bins$ALT_FREQ_NORM))           # Arcsin transformation

# Visualize distributions of the normalized and transformed data
p1 <- ggplot(non_zero_bins, aes(x = ALT_FREQ_NORM)) + 
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = paste("Normalized ALT_FREQ_NORM distribution (non-zero bins, ", BIN_SIZE_LABEL, ")", sep=""), 
       x = "ALT_FREQ_NORM") +
  theme_minimal()

p2 <- ggplot(non_zero_bins, aes(x = asin)) + 
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = "Arcsin transformation (non-zero bins)",
       x = "asin") +
  theme_minimal()

p3 <- ggplot(non_zero_bins, aes(x = log)) + 
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = "Log10 transformation (non-zero bins)",
       x = "log") +
  theme_minimal()

ggpubr::ggarrange(p1, p2, p3, common.legend = TRUE)

K-means Clustering with K=3 for Non-Zero Bins

We apply K-means clustering to each version of the normalized data for the non-zero bins, using k=3:

K=3
# Apply K-means clustering with k=3 to each transformation for non-zero bins
non_zero_bins$K <- as.factor(Ckmeans.1d.dp(non_zero_bins$ALT_FREQ_NORM, K)$cluster)
non_zero_bins$Klog <- as.factor(Ckmeans.1d.dp(non_zero_bins$log, K)$cluster)
non_zero_bins$Kasin <- as.factor(Ckmeans.1d.dp(non_zero_bins$asin, K)$cluster)

# Function to relabel clusters as REF, HET, ALT based on ALT_FREQ_NORM
relabel_clusters <- function(clusters, data) {
  # Calculate mean ALT_FREQ_NORM for each cluster
  cluster_means <- tapply(data$ALT_FREQ_NORM, clusters, mean)
  
  # Order clusters by mean ALT_FREQ_NORM
  ordered_clusters <- order(cluster_means)
  
  # Create mapping from original cluster to REF, HET, ALT
  cluster_map <- rep(NA, length(unique(as.numeric(clusters))))
  cluster_map[ordered_clusters[1]] <- "REF"
  cluster_map[ordered_clusters[2]] <- "HET"
  cluster_map[ordered_clusters[3]] <- "ALT"
  
  # Apply mapping to original clusters
  return(factor(cluster_map[as.numeric(clusters)], levels = c("REF", "HET", "ALT")))
}

# Apply relabeling to each clustering method
non_zero_bins$K <- relabel_clusters(non_zero_bins$K, non_zero_bins)
non_zero_bins$Klog <- relabel_clusters(non_zero_bins$Klog, non_zero_bins)
non_zero_bins$Kasin <- relabel_clusters(non_zero_bins$Kasin, non_zero_bins)

# Check cluster proportions for each method
cat("K-means cluster proportions (non-zero bins):\n")

## K-means cluster proportions (non-zero bins):

table(non_zero_bins$K) %>% prop.table() %>% round(3)

## 
##   REF   HET   ALT 
## 0.969 0.017 0.014

cat("\nLog-transformed K-means cluster proportions (non-zero bins):\n")

## 
## Log-transformed K-means cluster proportions (non-zero bins):

table(non_zero_bins$Klog) %>% prop.table() %>% round(3)

## 
##   REF   HET   ALT 
## 0.338 0.630 0.032

cat("\nArcsin-transformed K-means cluster proportions (non-zero bins):\n")

## 
## Arcsin-transformed K-means cluster proportions (non-zero bins):

table(non_zero_bins$Kasin) %>% prop.table() %>% round(3)

## 
##   REF   HET   ALT 
## 0.968 0.018 0.014

Gaussian Mixture Modeling for Non-Zero Bins

As a comparison to K-means, we also apply a Gaussian mixture model using the REBMIX library:

# Fit Gaussian mixture model with 3 components
normalest <- REBMIX(
  Dataset = list(data.frame(Value = non_zero_bins$ALT_FREQ_NORM)),
  Preprocessing = "histogram",
  cmin = 3,  # Force 3 components
  cmax = 3,
  Criterion = "BIC",
  pdf = "normal"
)

# Assign clusters
normclu <- RCLRMIX(x = normalest)
non_zero_bins$Kgmm <- as.factor(normclu@Zp)

# Relabel GMM clusters
non_zero_bins$Kgmm <- relabel_clusters(non_zero_bins$Kgmm, non_zero_bins)

# Check GMM cluster proportions
cat("Gaussian Mixture Model cluster proportions (non-zero bins):\n")

## Gaussian Mixture Model cluster proportions (non-zero bins):

table(non_zero_bins$Kgmm) %>% prop.table() %>% round(3)

## 
##   REF   HET   ALT 
## 0.960 0.026 0.015

Check Clustering Results for Non-Zero Bins

Let’s examine the clustering results for the non-zero bins:

# Calculate mean ALT_FREQ_NORM for each cluster and method
cluster_summary <- non_zero_bins %>%
  pivot_longer(cols = c(K, Klog, Kasin, Kgmm), 
               names_to = "method", 
               values_to = "cluster") %>%
  group_by(method, cluster) %>%
  summarize(
    mean_alt_freq_norm = mean(ALT_FREQ_NORM),
    min_alt_freq_norm = min(ALT_FREQ_NORM),
    max_alt_freq_norm = max(ALT_FREQ_NORM),
    n_bins = n()
  ) %>%
  ungroup()

# Display summary
cluster_summary %>%
  mutate(
    method = case_when(
      method == "K" ~ "K-means (raw)",
      method == "Klog" ~ "K-means (log10)",
      method == "Kasin" ~ "K-means (arcsin)",
      method == "GMM" ~ "Gaussian Mixture",
      TRUE ~ method
    )
  ) %>%
  arrange(method, cluster) %>%
  knitr::kable(digits = 3)

method	cluster	mean_alt_freq_norm	min_alt_freq_norm	max_alt_freq_norm	n_bins
K-means (arcsin)	REF	0.003	0.000	0.016	2060
K-means (arcsin)	HET	0.049	0.020	0.096	39
K-means (arcsin)	ALT	0.170	0.117	0.301	29
K-means (log10)	REF	0.002	0.000	0.002	719
K-means (log10)	HET	0.003	0.002	0.015	1340
K-means (log10)	ALT	0.100	0.016	0.301	69
K-means (raw)	REF	0.003	0.000	0.025	2062
K-means (raw)	HET	0.051	0.028	0.096	37
K-means (raw)	ALT	0.170	0.117	0.301	29
Kgmm	REF	0.002	0.000	0.006	2042
Kgmm	HET	0.035	0.006	0.077	55
Kgmm	ALT	0.165	0.091	0.301	31

# Visualize the clustering results for non-zero bins
# Define a consistent color palette for REF, HET, ALT
pal <- c("gold", "springgreen4", "purple4")
names(pal) <- c("REF", "HET", "ALT")

# Create a function for histograms colored by cluster
create_hist <- function(data, method_col, title) {
  ggplot(data, aes(x = ALT_FREQ_NORM, fill = .data[[method_col]])) +
    geom_histogram(bins = 30, position = "identity", alpha = 0.6) +
    scale_fill_manual(values = pal) +
    labs(title = title, x = "ALT_FREQ_NORM", fill = "Cluster") +
    theme_minimal()
}

# Create histograms for each method
h1 <- create_hist(non_zero_bins, "K", "K-means on normalized ALT_FREQ")
h2 <- create_hist(non_zero_bins, "Klog", "K-means on log10 transformed ALT_FREQ")
h3 <- create_hist(non_zero_bins, "Kasin", "K-means on arcsin transformed ALT_FREQ")
h4 <- create_hist(non_zero_bins, "Kgmm", "Gaussian mixture model on normalized ALT_FREQ")

# Display all histograms
ggpubr::ggarrange(h1, h2, h3, h4, ncol = 2)

## $`1`

## 
## $`2`

## 
## attr(,"class")
## [1] "list"      "ggarrange"

Merge Results Back to Full Dataset

Now we merge the clustering results back to the full dataset, ensuring that bins with ALT_FREQ_NORM = 0 are classified as “REF”:

# Create a new column in read_freq for each clustering method
read_freq$K <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Klog <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Kasin <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Kgmm <- factor(NA, levels = c("REF", "HET", "ALT"))

# First, assign the non-zero bin clustering results to the main dataset
for (method in c("K", "Klog", "Kasin", "Kgmm")) {
  # Create an index for non-zero bins in the original data
  non_zero_idx <- which(read_freq$ALT_FREQ_NORM > 0)
  
  # Assign cluster classifications to non-zero bins
  read_freq[[method]][non_zero_idx] <- non_zero_bins[[method]]
}

# Then, force bins with ALT_FREQ_NORM = 0 to be "REF"
zero_idx <- which(read_freq$ALT_FREQ_NORM == 0)
for (method in c("K", "Klog", "Kasin", "Kgmm")) {
  read_freq[[method]][zero_idx] <- "REF"
}

# The log transform hace nice normala data but high sensitivity to HET
# so I'decide to change thos hets to REF
read_freq$Klog[read_freq$Klog=="HET"] <- "REF"

# Verify the merging worked correctly
cat("Final cluster proportions for raw K-means:\n")

## Final cluster proportions for raw K-means:

table(read_freq$K) %>% prop.table() %>% round(3)

## 
##   REF   HET   ALT 
## 0.969 0.017 0.014

# Summarize mean ALT_FREQ_NORM for each genotype class across methods
genotype_summary <- read_freq %>%
  pivot_longer(cols = c(K, Klog, Kasin, Kgmm), 
               names_to = "method", 
               values_to = "genotype") %>%
  group_by(method, genotype) %>%
  summarize(
    mean_alt_freq_norm = mean(ALT_FREQ_NORM),
    mean_alt_freq = mean(ALT_FREQ),
    median_alt_freq_norm = median(ALT_FREQ_NORM),
    mean_n = mean(n),
    n_bins = n()
  ) %>%
  ungroup()

# Display summary
genotype_summary %>%
  mutate(
    method = case_when(
      method == "K" ~ "K-means (raw)",
      method == "Klog" ~ "K-means (log10)",
      method == "Kasin" ~ "K-means (arcsin)",
      method == "Kgmm" ~ "Gaussian Mixture",
      TRUE ~ method
    )
  ) %>%
  arrange(method, genotype) %>%
  knitr::kable(digits = 3)

method	genotype	mean_alt_freq_norm	mean_alt_freq	median_alt_freq_norm	mean_n	n_bins
Gaussian Mixture	REF	0.002	0.003	0.002	16033.32	2046
Gaussian Mixture	HET	0.035	0.040	0.034	15385.67	55
Gaussian Mixture	ALT	0.165	0.172	0.150	15466.26	31
K-means (arcsin)	REF	0.003	0.003	0.002	16047.72	2064
K-means (arcsin)	HET	0.049	0.057	0.045	14317.44	39
K-means (arcsin)	ALT	0.170	0.177	0.156	15481.79	29
K-means (log10)	REF	0.003	0.003	0.002	16045.70	2063
K-means (log10)	ALT	0.100	0.107	0.073	14892.32	69
K-means (raw)	REF	0.003	0.003	0.002	16044.19	2066
K-means (raw)	HET	0.051	0.059	0.046	14421.11	37
K-means (raw)	ALT	0.170	0.177	0.156	15481.79	29

Visualization and Comparison

To compare the different clustering methods, we visualize the data in several ways:

1. Scatter plots of ALT_FREQ_NORM colored by genotype

# Create a function for scatter plots
create_scatter <- function(data, method_col, title) {
  data %>%
    mutate(POS = POS_BIN * BIN_SIZE) %>%
    ggplot(aes(x = POS, y = ALT_FREQ_NORM, col = .data[[method_col]])) +
        labs(title = title, x = "Position (bp)", y = "ALT_FREQ_NORM", color = "Genotype") +
    geom_point(size = 1, alpha = 0.7) +
    scale_color_manual(values = pal) +
    facet_wrap(.~CONTIG,ncol=10, scales = "free_x", strip.position = "bottom") +
    theme_bw() +
    scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6))
}

read_freq$CONTIG <-factor(read_freq$CONTIG, levels= paste0("chr",1:10))
  
# Create scatter plots for each method
p1 <- create_scatter(read_freq, "K", paste("K-means on normalized ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""))
p2 <- create_scatter(read_freq, "Klog", paste("K-means on log10 transformed ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""))
p3 <- create_scatter(read_freq, "Kasin", paste("K-means on arcsin transformed ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""))
p4 <- create_scatter(read_freq, "Kgmm", paste("Gaussian mixture model on normalized ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""))

# Display all scatter plots
ggpubr::ggarrange(p1, p2, p3, p4, ncol = 1, common.legend = TRUE)

# Create a function for tile plots
create_tile <- function(data, method_col, title) {

  data %>%
    mutate(POS = POS_BIN * BIN_SIZE) %>%
    ggplot(aes(x = POS, y = 1, fill = .data[[method_col]])) +
    geom_tile() +
    facet_wrap( .~CONTIG, ncol = 1, strip.position = "left") +
    scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6))+
    scale_fill_manual(values = pal) +
    labs(title = title, x = "Position (bp)", fill = "Genotype") +
    ggpubr::theme_classic2() +
    theme(
      # legend.position = "top",
      strip.background = element_blank(),
      strip.text.y.left = element_text(angle = 0),
      panel.spacing.y=unit(0.0, "lines"),
      axis.line.y = element_blank(),
      axis.text.y = element_blank(),
      axis.title.y = element_blank(),
      axis.ticks.y = element_blank()
    ) 
}

# Create tile plots for PN17_SID1632
# create_tile(read_freq, "GENOTYPE", paste("ALT_FREQ (", BIN_SIZE_LABEL, " bins)"," Sample PN17_SID1632", sep=""))

create_tile(read_freq, "Klog", paste("ALT_FREQ (", BIN_SIZE_LABEL, " bins)"," Sample PN17_SID1632", sep=""))

3. Combined visualization of all methods

# Create a combined plot for all methods
read_freq %>%
  tidyr::pivot_longer(cols = starts_with("K"), names_to = "method", values_to = "K") %>%
  mutate(
    POS = POS_BIN * BIN_SIZE,
    method = factor(method, 
                   levels = c("K", "Klog", "Kasin", "Kgmm"),
                   labels = c("K-means (raw)", "K-means (log10)", "K-means (arcsin)", "Gaussian Mixture"))
  ) %>%
  ggplot(aes(x = POS, y = 1, fill = K)) +
  geom_tile() +
  scale_fill_manual(values = pal) +
  facet_grid(method ~ CONTIG, scales = "free") +
  scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  ggpubr::theme_classic2() +
  theme(
    legend.position = "bottom",
    strip.background = element_blank(),
    strip.text.x = element_text(hjust = 0),
    strip.text.y = element_text(angle = 0, face = "bold"),
    panel.spacing.x=unit(0.0, "lines"),
    panel.spacing.y=unit(0.0, "lines"),
    axis.line.y = element_blank(),
    axis.text.y = element_blank(),
    axis.line.x = element_blank(),
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  labs(
    title = paste("Comparison of Clustering Methods (", BIN_SIZE_LABEL, " bins)", sep=""),
    x = "Genome Position (Mb)",
    fill = "Genotype"
  )

Genotype Frequency Comparison

Let’s compare the genotype frequencies across all methods:

# Create a summary of genotype frequencies for all methods
genotype_freq <- read_freq %>%
  pivot_longer(cols = c(K, Klog, Kasin, Kgmm), 
               names_to = "method", 
               values_to = "genotype") %>%
  group_by(method, genotype) %>%
  summarize(
    count = n(),
    percentage = n() / nrow(read_freq) * 100
  ) %>%
  ungroup() %>%
  mutate(
    method = case_when(
      method == "K" ~ "K-means (raw)",
      method == "Klog" ~ "K-means (log10)",
      method == "Kasin" ~ "K-means (arcsin)",
      method == "Kgmm" ~ "Gaussian Mixture",
      TRUE ~ method
    )
  )

# Display frequency table
genotype_freq %>%
  pivot_wider(
    id_cols = genotype,
    names_from = method,
    values_from = percentage
  ) %>%
  knitr::kable(
    caption = paste("Genotype frequencies (%) across methods (", BIN_SIZE_LABEL, " bins)", sep=""),
    digits = 1
  )

Genotype frequencies (%) across methods (1Mb bins)
genotype	K-means (raw)	K-means (arcsin)	Gaussian Mixture	K-means (log10)
REF	96.9	96.8	96.0	96.8
HET	1.7	1.8	2.6	NA
ALT	1.4	1.4	1.5	3.2

# Create a bar plot to visualize frequencies
ggplot(genotype_freq, aes(x = method, y = percentage, fill = genotype)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_manual(values = pal) +
  labs(
    title = paste("Genotype frequencies across methods (", BIN_SIZE_LABEL, " bins)", sep=""),
    x = "Method",
    y = "Percentage (%)",
    fill = "Genotype"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top"
  )

Method Agreement Analysis

Let’s analyze how much the different clustering methods agree with each other:

# Create a function to calculate agreement between methods
agreement_matrix <- matrix(NA, 4, 4)
methods <- c("K", "Klog", "Kasin", "Kgmm")
colnames(agreement_matrix) <- methods
rownames(agreement_matrix) <- methods

for (i in 1:4) {
  for (j in 1:4) {
    agreement_matrix[i, j] <- mean(read_freq[[methods[i]]] == read_freq[[methods[j]]]) * 100
  }
}

# Display agreement matrix
knitr::kable(agreement_matrix, caption = "Percentage agreement between methods", digits = 1)

Percentage agreement between methods
	K	Klog	Kasin	Kgmm
K	100.0	98.1	99.9	99.0
Klog	98.1	100.0	98.1	97.4
Kasin	99.9	98.1	100.0	99.1
Kgmm	99.0	97.4	99.1	100.0

# Visualize agreement matrix
agreement_df <- as.data.frame(agreement_matrix) %>%
  tibble::rownames_to_column("Method1") %>%
  pivot_longer(-Method1, names_to = "Method2", values_to = "Agreement")

ggplot(agreement_df, aes(x = Method1, y = Method2, fill = Agreement)) +
  geom_tile() +
  geom_text(aes(label = round(Agreement, 1)), color = "white", size = 5) +
  scale_fill_gradient(low = "steelblue", high = "darkred") +
  labs(
    title = "Agreement Between Clustering Methods (%)",
    x = NULL,
    y = NULL,
    fill = "Agreement (%)"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_text(face = "bold"),
    panel.grid = element_blank()
  )

Examine Differences Between Methods

Let’s look at where the methods disagree:

# Create a visualization of where methods disagree, across all chromosomes
disagreement_data <- read_freq %>%
  mutate(
    POS = POS_BIN * BIN_SIZE,
    # Check if all methods agree
    agreement = factor(ifelse(
      K == Klog & K == Kasin & K == Kgmm,
      "All methods agree",
      "Methods disagree"
    ), levels = c("All methods agree", "Methods disagree"))
  )

# Create tile plot with chromosomes on facets
ggplot(disagreement_data, aes(x = POS, y = 1, fill = agreement)) +
  geom_tile() +
  scale_fill_manual(values = c("darkgreen", "darkred")) +
  facet_grid(. ~ CONTIG, scales = "free_x") +
  scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  ggpubr::theme_classic2() +
  theme(
    legend.position = "bottom",
    strip.background = element_blank(),
    strip.text.x = element_text(hjust = 0),
    strip.text.y = element_text(angle = 0, face = "bold"),
    panel.spacing.x=unit(0.0, "lines"),
    panel.spacing.y=unit(0.0, "lines"),
    axis.line.y = element_blank(),
    axis.text.y = element_blank(),
    axis.line.x = element_blank(),
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  labs(
    title = paste("Regions of Agreement/Disagreement Between Methods (", BIN_SIZE_LABEL, " bins)", sep=""),
    x = "Genome Position (Mb)",
    fill = NULL
  )

Effect of Bin Size

The bin size used in this analysis is 1Mb. Adjusting the bin size can have several effects on the analysis:

Resolution: Smaller bins provide higher resolution but may have more noise due to fewer variants per bin. Larger bins provide smoother results but lower resolution.
Statistical power: Larger bins typically contain more variants, which can provide more robust allele frequency estimates.
Biological relevance: The appropriate bin size may depend on the biological question. For example, analyzing recombination might require smaller bins than analyzing large structural variants.

To explore different bin sizes, you can change the BIN_SIZE parameter at the top of this document and rerun the analysis.

Save Results

We can save the final results for further analysis:

# Extract relevant columns
results <- read_freq %>%
  select(POS_BIN, n, NCOUNT, ALT_COUNT, ALT_FREQ, ALT_FREQ_NORM, K)

# Save to file with bin size in the filename
output_file <- paste0("maize_bin_ancestry_", BIN_SIZE_LABEL, ".csv")
write.csv(results, output_file, row.names = FALSE)

cat("Results saved to:", output_file, "\n")

Conclusion

We implemented a three-class clustering approach for genomic ancestry assignment in bins of size 1Mb, with special handling for bins with no alternative alleles:

Bins with no alternative alleles (ALT_FREQ_NORM = 0) were automatically classified as “REF”
For the remaining bins (ALT_FREQ_NORM > 0), K-means clustering with k=3 was applied
The clusters were labeled “REF” (lowest ALT_FREQ_NORM), “HET” (intermediate), and “ALT” (highest)
We replaced any clustering-assigned “REF” with forced “REF” assignments for bins with zero ALT alleles

After comparing four clustering methods on the non-zero bins: 1. K-means on normalized ALT_FREQ 2. K-means on log10-transformed normalized ALT_FREQ 3. K-means on arcsin-transformed normalized ALT_FREQ 4. Gaussian mixture model on normalized ALT_FREQ

We found substantial agreement between the methods, with K-means on raw normalized ALT_FREQ being our preferred approach. This approach has several advantages:

It directly enforces the biological constraint that bins with no alternative alleles must be REF
It normalizes by variant count to account for coverage variations
It allows for the detection of three distinct genotype classes even within the non-zero ALT_FREQ bins

The three genotype clusters effectively represent: - REF: Regions with zero or very low normalized alternative allele frequency, suggesting reference homozygous genotype - HET: Regions with intermediate normalized alternative allele frequency, suggesting heterozygous genotype - ALT: Regions with high normalized alternative allele frequency, suggesting alternative homozygous genotype

This approach provides a biologically interpretable framework for analyzing genomic ancestry patterns in maize.

Session Information

# Record session information for reproducibility
sessionInfo()

## R version 4.4.2 (2024-10-31)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sequoia 15.1.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] tibble_3.2.1        ggpubr_0.6.0        scales_1.3.0       
## [4] tidyr_1.3.1         rebmix_2.16.0       Ckmeans.1d.dp_4.3.5
## [7] ggplot2_3.5.2       dplyr_1.1.4        
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10       generics_0.1.3    rstatix_0.7.2     lattice_0.22-6   
##  [5] digest_0.6.37     magrittr_2.0.3    evaluate_1.0.3    grid_4.4.2       
##  [9] fastmap_1.2.0     Matrix_1.7-1      jsonlite_2.0.0    backports_1.5.0  
## [13] gridExtra_2.3     mgcv_1.9-1        purrr_1.0.4       jquerylib_0.1.4  
## [17] abind_1.4-5       Rdpack_2.6.1      cli_3.6.4         rlang_1.1.6      
## [21] rbibutils_2.2.16  cowplot_1.1.3     munsell_0.5.1     splines_4.4.2    
## [25] withr_3.0.2       cachem_1.1.0      yaml_2.3.10       tools_4.4.2      
## [29] ggsignif_0.6.4    colorspace_2.1-1  broom_1.0.8       vctrs_0.6.5      
## [33] R6_2.6.1          lifecycle_1.0.4   car_3.1-2         pkgconfig_2.0.3  
## [37] pillar_1.10.2     bslib_0.9.0       gtable_0.3.6      glue_1.8.0       
## [41] Rcpp_1.0.14       xfun_0.52         tidyselect_1.2.1  rstudioapi_0.17.1
## [45] knitr_1.50        farver_2.1.2      htmltools_0.5.8.1 nlme_3.1-166     
## [49] labeling_0.4.3    rmarkdown_2.29    carData_3.0-5     compiler_4.4.2