Maize Genome Ancestry Analysis with Adjustable Bin Size

Analysis Parameters

You can adjust the bin size by changing the parameter below:

# Set the bin size in base pairs (default: 100,000 = 100kb)
BIN_SIZE <- 1000000  # Change this value to adjust bin size

# Bin size in user-friendly format for plot labels
if (BIN_SIZE >= 1000000) {
  BIN_SIZE_LABEL <- paste0(BIN_SIZE/1000000, "Mb")
} else if (BIN_SIZE >= 1000) {
  BIN_SIZE_LABEL <- paste0(BIN_SIZE/1000, "kb")
} else {
  BIN_SIZE_LABEL <- paste0(BIN_SIZE, "bp")
}

Overview

This document outlines a genomic analysis performed on maize genetic data. The goal is to identify genomic ancestry across bins of size 1Mb using the following approach:

Bins with no alternative alleles (ALT_FREQ_NORM = 0) are automatically classified as “REF”
For the remaining bins (ALT_FREQ_NORM > 0), K-means clustering with k=3 is applied
The clusters are assigned “REF” (lowest ALT_FREQ_NORM), “HET” (intermediate ALT_FREQ_NORM), and “ALT” (highest ALT_FREQ_NORM)
We replace the “REF” assignment from step 3 with the automatic “REF” assignment from step 1 for bins with ALT_FREQ_NORM = 0

Four different clustering methods are compared: 1. K-means with normalized ALT_FREQ 2. K-means with arcsin transformation of normalized ALT_FREQ 3. K-means with log10 transformation of normalized ALT_FREQ 4. Gaussian mixture model using the REBMIX library

Load Required Packages

# Load required libraries
library(dplyr)        # Data manipulation
library(ggplot2)      # Plotting
library(Ckmeans.1d.dp) # K-means clustering
library(rebmix)       # Gaussian mixture model
library(tidyr)        # Data reshaping
library(scales)       # Unit formatting for plots
library(ggpubr)       # Plot arrangement
library(tibble)       # Modern data frames

Data Preparation

First, we read the allelic counts data and create bins of the specified size:

# Read the allelic counts data
read_count <- read.table("~/Desktop/PN17_SID1632.allelicCounts.tsv", 
                         comment.char = "@", header = TRUE)

# Display dimensions of the input data
nrow(read_count)

## [1] 77459

# Examine distribution of alternative allele counts
with(read_count, table(ALT_COUNT))

## ALT_COUNT
##     0     1     2     3     4 
## 77267   162    22     6     2

# Create bins of specified size and calculate summary statistics for each bin
read_freq <- read_count %>%
  # Create position bins by dividing positions into bins of size BIN_SIZE
  mutate(POS_BIN = ceiling(POSITION/BIN_SIZE) %>% as.integer()) %>%
  # Group by these bins
  group_by(POS_BIN) %>%
  # Calculate statistics for each bin
  summarise(
    n = n(),                                   # Number of variants in bin
    NCOUNT = (sum(ALT_COUNT) + sum(REF_COUNT)), # Total read count
    ALT_COUNT = sum(ALT_COUNT),                # Sum of alternative allele counts
    ALT_FREQ = sum(ALT_COUNT)/NCOUNT           # Alternative allele frequency
  )

# Display first few rows of the binned data
head(read_freq)

## # A tibble: 6 × 5
##   POS_BIN     n NCOUNT ALT_COUNT ALT_FREQ
##     <int> <int>  <int>     <int>    <dbl>
## 1       1   609    523         3  0.00574
## 2       2   753    640         3  0.00469
## 3       3   425    377         2  0.00531
## 4       4   279    218         3  0.0138 
## 5       5   516    380         0  0      
## 6       6   481    440         0  0

# Display summary of bins
cat("Total number of bins:", nrow(read_freq), "\n")

## Total number of bins: 153

cat("Average number of variants per bin:", mean(read_freq$n), "\n")

## Average number of variants per bin: 506.268

cat("Median number of variants per bin:", median(read_freq$n), "\n")

## Median number of variants per bin: 471

cat("Range of variants per bin:", min(read_freq$n), "to", max(read_freq$n), "\n")

## Range of variants per bin: 139 to 1245

Normalization by Variant Count

We’ll normalize the alternative allele frequency by the number of variants in each bin:

# Calculate normalization statistics
mean_n <- mean(read_freq$n)
sd_n <- sd(read_freq$n)

# Create normalized ALT_FREQ (accounting for variant density)
read_freq <- read_freq %>%
  mutate(
    # Normalize by variant count
    ALT_FREQ_NORM = ALT_FREQ * (n / mean_n),
    # Cap extreme values (optional)
    ALT_FREQ_NORM = pmin(pmax(ALT_FREQ_NORM, 0), 1)
  )

# Show the relationship between variant count and ALT_FREQ
ggplot(read_freq, aes(x = n, y = ALT_FREQ)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "loess", se = TRUE) +
  labs(title = paste("Relationship between variant count and ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""),
       x = "Number of variants in bin", 
       y = "Alternative allele frequency") +
  theme_minimal()

# Compare original vs normalized frequencies
p1 <- ggplot(read_freq, aes(x = ALT_FREQ)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = "Original ALT_FREQ distribution", x = "ALT_FREQ") +
  theme_minimal()

p2 <- ggplot(read_freq, aes(x = ALT_FREQ_NORM)) +
  geom_histogram(bins = 30, fill = "tomato", color = "white") +
  labs(title = "Normalized ALT_FREQ_NORM distribution", x = "ALT_FREQ_NORM") +
  theme_minimal()

ggpubr::ggarrange(p1, p2, ncol = 2)

Identify Zero-ALT Bins

We’ll now identify bins with no alternative alleles:

# Check how many bins have ALT_FREQ_NORM = 0
zero_alt_bins <- sum(read_freq$ALT_FREQ_NORM == 0)
total_bins <- nrow(read_freq)

cat("Bins with ALT_FREQ_NORM = 0:", 
    zero_alt_bins, "out of", total_bins, 
    "(", round(100 * zero_alt_bins / total_bins, 1), "%)\n")

## Bins with ALT_FREQ_NORM = 0: 51 out of 153 ( 33.3 %)

# Create a separate dataset for non-zero bins (for clustering)
non_zero_bins <- read_freq %>%
  filter(ALT_FREQ_NORM > 0)

cat("Bins with ALT_FREQ_NORM > 0 (to be clustered):", 
    nrow(non_zero_bins), "out of", total_bins,
    "(", round(100 * nrow(non_zero_bins) / total_bins, 1), "%)\n")

## Bins with ALT_FREQ_NORM > 0 (to be clustered): 102 out of 153 ( 66.7 %)

Data Transformation for Non-Zero Bins

We create transformed versions of the normalized ALT_FREQ data for non-zero bins:

# Calculate minimum possible normalized frequency (for log transformation)
min_read_freq_norm <-   1/(1/min(read_freq$ALT_FREQ_NORM[read_freq$ALT_FREQ_NORM>0])+1)


# Create transformations
non_zero_bins$log <- log10(non_zero_bins$ALT_FREQ_NORM + min_read_freq_norm) # Log transformation
non_zero_bins$asin <- asin(sqrt(non_zero_bins$ALT_FREQ_NORM))           # Arcsin transformation

# Visualize distributions of the normalized and transformed data
p1 <- ggplot(non_zero_bins, aes(x = ALT_FREQ_NORM)) + 
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = paste("Normalized ALT_FREQ_NORM distribution (non-zero bins, ", BIN_SIZE_LABEL, ")", sep=""), 
       x = "ALT_FREQ_NORM") +
  theme_minimal()

p2 <- ggplot(non_zero_bins, aes(x = asin)) + 
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = "Arcsin transformation (non-zero bins)",
       x = "asin") +
  theme_minimal()

p3 <- ggplot(non_zero_bins, aes(x = log)) + 
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = "Log10 transformation (non-zero bins)",
       x = "log") +
  theme_minimal()

ggpubr::ggarrange(p1, p2, p3, common.legend = TRUE)

K-means Clustering with K=3 for Non-Zero Bins

We apply K-means clustering to each version of the normalized data for the non-zero bins, using k=3:

# Apply K-means clustering with k=3 to each transformation for non-zero bins
non_zero_bins$K <- as.factor(Ckmeans.1d.dp(non_zero_bins$ALT_FREQ_NORM, 3)$cluster)
non_zero_bins$Klog <- as.factor(Ckmeans.1d.dp(non_zero_bins$log, 3)$cluster)
non_zero_bins$Kasin <- as.factor(Ckmeans.1d.dp(non_zero_bins$asin, 3)$cluster)

# Function to relabel clusters as REF, HET, ALT based on ALT_FREQ_NORM
relabel_clusters <- function(clusters, data) {
  # Calculate mean ALT_FREQ_NORM for each cluster
  cluster_means <- tapply(data$ALT_FREQ_NORM, clusters, mean)
  
  # Order clusters by mean ALT_FREQ_NORM
  ordered_clusters <- order(cluster_means)
  
  # Create mapping from original cluster to REF, HET, ALT
  cluster_map <- rep(NA, length(unique(as.numeric(clusters))))
  cluster_map[ordered_clusters[1]] <- "REF"
  cluster_map[ordered_clusters[2]] <- "HET"
  cluster_map[ordered_clusters[3]] <- "ALT"
  
  # Apply mapping to original clusters
  return(factor(cluster_map[as.numeric(clusters)], levels = c("REF", "HET", "ALT")))
}

# Apply relabeling to each clustering method
non_zero_bins$K <- relabel_clusters(non_zero_bins$K, non_zero_bins)
non_zero_bins$Klog <- relabel_clusters(non_zero_bins$Klog, non_zero_bins)
non_zero_bins$Kasin <- relabel_clusters(non_zero_bins$Kasin, non_zero_bins)

# Check cluster proportions for each method
cat("K-means cluster proportions (non-zero bins):\n")

## K-means cluster proportions (non-zero bins):

table(non_zero_bins$K) %>% prop.table() %>% round(3)

## 
##   REF   HET   ALT 
## 0.647 0.275 0.078

cat("\nLog-transformed K-means cluster proportions (non-zero bins):\n")

## 
## Log-transformed K-means cluster proportions (non-zero bins):

table(non_zero_bins$Klog) %>% prop.table() %>% round(3)

## 
##   REF   HET   ALT 
## 0.480 0.363 0.157

cat("\nArcsin-transformed K-means cluster proportions (non-zero bins):\n")

## 
## Arcsin-transformed K-means cluster proportions (non-zero bins):

table(non_zero_bins$Kasin) %>% prop.table() %>% round(3)

## 
##   REF   HET   ALT 
## 0.480 0.412 0.108

Gaussian Mixture Modeling for Non-Zero Bins

As a comparison to K-means, we also apply a Gaussian mixture model using the REBMIX library:

# Fit Gaussian mixture model with 3 components
normalest <- REBMIX(
  Dataset = list(data.frame(Value = non_zero_bins$ALT_FREQ_NORM)),
  Preprocessing = "histogram",
  cmin = 3,  # Force 3 components
  cmax = 3,
  Criterion = "BIC",
  pdf = "normal"
)

# Assign clusters
normclu <- RCLRMIX(x = normalest)
non_zero_bins$Kgmm <- as.factor(normclu@Zp)

# Relabel GMM clusters
non_zero_bins$Kgmm <- relabel_clusters(non_zero_bins$Kgmm, non_zero_bins)

# Check GMM cluster proportions
cat("Gaussian Mixture Model cluster proportions (non-zero bins):\n")

## Gaussian Mixture Model cluster proportions (non-zero bins):

table(non_zero_bins$Kgmm) %>% prop.table() %>% round(3)

## 
##   REF   HET   ALT 
## 0.461 0.422 0.118

Check Clustering Results for Non-Zero Bins

Let’s examine the clustering results for the non-zero bins:

# Calculate mean ALT_FREQ_NORM for each cluster and method
cluster_summary <- non_zero_bins %>%
  pivot_longer(cols = c(K, Klog, Kasin, Kgmm), 
               names_to = "method", 
               values_to = "cluster") %>%
  group_by(method, cluster) %>%
  summarize(
    mean_alt_freq_norm = mean(ALT_FREQ_NORM),
    min_alt_freq_norm = min(ALT_FREQ_NORM),
    max_alt_freq_norm = max(ALT_FREQ_NORM),
    n_bins = n()
  ) %>%
  ungroup()

# Display summary
cluster_summary %>%
  mutate(
    method = case_when(
      method == "K" ~ "K-means (raw)",
      method == "Klog" ~ "K-means (log10)",
      method == "Kasin" ~ "K-means (arcsin)",
      method == "Kgmm" ~ "Gaussian Mixture",
      TRUE ~ method
    )
  ) %>%
  arrange(method, cluster) %>%
  knitr::kable(digits = 3)

method	cluster	mean_alt_freq_norm	min_alt_freq_norm	max_alt_freq_norm	n_bins
Gaussian Mixture	REF	0.002	0.001	0.003	47
Gaussian Mixture	HET	0.006	0.004	0.009	43
Gaussian Mixture	ALT	0.014	0.009	0.023	12
K-means (arcsin)	REF	0.002	0.001	0.004	49
K-means (arcsin)	HET	0.006	0.004	0.009	42
K-means (arcsin)	ALT	0.014	0.010	0.023	11
K-means (log10)	REF	0.002	0.001	0.004	49
K-means (log10)	HET	0.006	0.004	0.008	37
K-means (log10)	ALT	0.013	0.009	0.023	16
K-means (raw)	REF	0.003	0.001	0.005	66
K-means (raw)	HET	0.007	0.005	0.011	28
K-means (raw)	ALT	0.016	0.013	0.023	8

# Visualize the clustering results for non-zero bins
# Define a consistent color palette for REF, HET, ALT
pal <- c("gold", "springgreen4", "purple4")
names(pal) <- c("REF", "HET", "ALT")

# Create a function for histograms colored by cluster
create_hist <- function(data, method_col, title) {
  ggplot(data, aes(x = ALT_FREQ_NORM, fill = .data[[method_col]])) +
    geom_histogram(bins = 30, position = "identity", alpha = 0.6) +
    scale_fill_manual(values = pal) +
    labs(title = title, x = "ALT_FREQ_NORM", fill = "Cluster") +
    theme_minimal()
}

# Create histograms for each method
h1 <- create_hist(non_zero_bins, "K", "K-means on normalized ALT_FREQ")
h2 <- create_hist(non_zero_bins, "Klog", "K-means on log10 transformed ALT_FREQ")
h3 <- create_hist(non_zero_bins, "Kasin", "K-means on arcsin transformed ALT_FREQ")
h4 <- create_hist(non_zero_bins, "Kgmm", "Gaussian mixture model on normalized ALT_FREQ")

# Display all histograms
ggpubr::ggarrange(h1, h2, h3, h4, ncol = 2)

## $`1`

## 
## $`2`

## 
## attr(,"class")
## [1] "list"      "ggarrange"

Merge Results Back to Full Dataset

Now we merge the clustering results back to the full dataset, ensuring that bins with ALT_FREQ_NORM = 0 are classified as “REF”:

# Create a new column in read_freq for each clustering method
read_freq$K <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Klog <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Kasin <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Kgmm <- factor(NA, levels = c("REF", "HET", "ALT"))

# First, assign the non-zero bin clustering results to the main dataset
for (method in c("K", "Klog", "Kasin", "Kgmm")) {
  # Create an index for non-zero bins in the original data
  non_zero_idx <- which(read_freq$ALT_FREQ_NORM > 0)
  
  # Assign cluster classifications to non-zero bins
  read_freq[[method]][non_zero_idx] <- non_zero_bins[[method]]
}

# Then, force bins with ALT_FREQ_NORM = 0 to be "REF"
zero_idx <- which(read_freq$ALT_FREQ_NORM == 0)
for (method in c("K", "Klog", "Kasin", "Kgmm")) {
  read_freq[[method]][zero_idx] <- "REF"
}

# Verify the merging worked correctly
cat("Final cluster proportions for raw K-means:\n")

## Final cluster proportions for raw K-means:

table(read_freq$K) %>% prop.table() %>% round(3)

## 
##   REF   HET   ALT 
## 0.765 0.183 0.052

# Summarize mean ALT_FREQ_NORM for each genotype class across methods
genotype_summary <- read_freq %>%
  pivot_longer(cols = c(K, Klog, Kasin, Kgmm), 
               names_to = "method", 
               values_to = "genotype") %>%
  group_by(method, genotype) %>%
  summarize(
    mean_alt_freq_norm = mean(ALT_FREQ_NORM),
    mean_alt_freq = mean(ALT_FREQ),
    median_alt_freq_norm = median(ALT_FREQ_NORM),
    mean_n = mean(n),
    n_bins = n()
  ) %>%
  ungroup()

# Display summary
genotype_summary %>%
  mutate(
    method = case_when(
      method == "K" ~ "K-means (raw)",
      method == "Klog" ~ "K-means (log10)",
      method == "Kasin" ~ "K-means (arcsin)",
      method == "Kgmm" ~ "Gaussian Mixture",
      TRUE ~ method
    )
  ) %>%
  arrange(method, genotype) %>%
  knitr::kable(digits = 3)

method	genotype	mean_alt_freq_norm	mean_alt_freq	median_alt_freq_norm	mean_n	n_bins
Gaussian Mixture	REF	0.001	0.001	0.000	457.480	98
Gaussian Mixture	HET	0.006	0.005	0.005	587.372	43
Gaussian Mixture	ALT	0.014	0.013	0.014	614.083	12
K-means (arcsin)	REF	0.001	0.001	0.000	455.830	100
K-means (arcsin)	HET	0.006	0.005	0.005	601.024	42
K-means (arcsin)	ALT	0.014	0.014	0.015	603.000	11
K-means (log10)	REF	0.001	0.001	0.000	455.830	100
K-means (log10)	HET	0.006	0.005	0.005	585.297	37
K-means (log10)	ALT	0.013	0.012	0.012	638.750	16
K-means (raw)	REF	0.002	0.002	0.002	469.632	117
K-means (raw)	HET	0.007	0.006	0.007	647.036	28
K-means (raw)	ALT	0.016	0.016	0.015	549.375	8

Visualization and Comparison

To compare the different clustering methods, we visualize the data in several ways:

1. Scatter plots of ALT_FREQ_NORM colored by genotype

# Create a function for scatter plots
create_scatter <- function(data, method_col, title) {
  data %>%
    mutate(POS = POS_BIN * BIN_SIZE) %>%
    ggplot(aes(x = POS, y = ALT_FREQ_NORM, col = .data[[method_col]])) +
    geom_point(size = 1, alpha = 0.7) +
    scale_color_manual(values = pal) +
    labs(title = title, x = "Position (bp)", y = "ALT_FREQ_NORM", color = "Genotype") +
    theme_bw() +
    scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6))
}

# Create scatter plots for each method
p1 <- create_scatter(read_freq, "K", paste("K-means on normalized ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""))
p2 <- create_scatter(read_freq, "Klog", paste("K-means on log10 transformed ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""))
p3 <- create_scatter(read_freq, "Kasin", paste("K-means on arcsin transformed ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""))
p4 <- create_scatter(read_freq, "Kgmm", paste("Gaussian mixture model on normalized ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""))

# Display all scatter plots
ggpubr::ggarrange(p1, p2, p3, p4, ncol = 1, common.legend = TRUE)

# Create a combined plot for all methods
read_freq %>%
  tidyr::pivot_longer(cols = starts_with("K"), names_to = "method", values_to = "K") %>%
  mutate(
    POS = POS_BIN * BIN_SIZE,
    method = factor(method, 
                   levels = c("K", "Klog", "Kasin", "Kgmm"),
                   labels = c("K-means (raw)", "K-means (log10)", "K-means (arcsin)", "Gaussian Mixture"))
  ) %>%
  ggplot(aes(x = POS, y = 1, fill = K)) +
  geom_tile() +
  scale_fill_manual(values = pal) +
  facet_wrap(. ~ method, ncol = 1, strip.position = "left") +
  scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  ggpubr::theme_classic2() +
  theme(
    strip.background = element_blank(),
    strip.text = element_text(size = 10, face = "bold"),
    axis.line.y = element_blank(),
    axis.text.y = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  labs(
    title = paste("Comparison of Clustering Methods (", BIN_SIZE_LABEL, " bins)", sep=""),
    x = "Genome Position (Mb)",
    fill = "Genotype"
  )

Genotype Frequency Comparison

Let’s compare the genotype frequencies across all methods:

# Create a summary of genotype frequencies for all methods
genotype_freq <- read_freq %>%
  pivot_longer(cols = c(K, Klog, Kasin, Kgmm), 
               names_to = "method", 
               values_to = "genotype") %>%
  group_by(method, genotype) %>%
  summarize(
    count = n(),
    percentage = n() / nrow(read_freq) * 100
  ) %>%
  ungroup() %>%
  mutate(
    method = case_when(
      method == "K" ~ "K-means (raw)",
      method == "Klog" ~ "K-means (log10)",
      method == "Kasin" ~ "K-means (arcsin)",
      method == "Kgmm" ~ "Gaussian Mixture",
      TRUE ~ method
    )
  )

# Display frequency table
genotype_freq %>%
  pivot_wider(
    id_cols = genotype,
    names_from = method,
    values_from = percentage
  ) %>%
  knitr::kable(
    caption = paste("Genotype frequencies (%) across methods (", BIN_SIZE_LABEL, " bins)", sep=""),
    digits = 1
  )

Genotype frequencies (%) across methods (1Mb bins)
genotype	K-means (raw)	K-means (arcsin)	Gaussian Mixture	K-means (log10)
REF	76.5	65.4	64.1	65.4
HET	18.3	27.5	28.1	24.2
ALT	5.2	7.2	7.8	10.5

# Create a bar plot to visualize frequencies
ggplot(genotype_freq, aes(x = method, y = percentage, fill = genotype)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_manual(values = pal) +
  labs(
    title = paste("Genotype frequencies across methods (", BIN_SIZE_LABEL, " bins)", sep=""),
    x = "Method",
    y = "Percentage (%)",
    fill = "Genotype"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top"
  )

Method Agreement Analysis

Let’s analyze how much the different clustering methods agree with each other:

# Create a function to calculate agreement between methods
agreement_matrix <- matrix(NA, 4, 4)
methods <- c("K", "Klog", "Kasin", "Kgmm")
colnames(agreement_matrix) <- methods
rownames(agreement_matrix) <- methods

for (i in 1:4) {
  for (j in 1:4) {
    agreement_matrix[i, j] <- mean(read_freq[[methods[i]]] == read_freq[[methods[j]]]) * 100
  }
}

# Display agreement matrix
knitr::kable(agreement_matrix, caption = "Percentage agreement between methods", digits = 1)

Percentage agreement between methods
	K	Klog	Kasin	Kgmm
K	100.0	83.7	86.9	85.0
Klog	83.7	100.0	96.7	96.1
Kasin	86.9	96.7	100.0	98.0
Kgmm	85.0	96.1	98.0	100.0

# Visualize agreement matrix
agreement_df <- as.data.frame(agreement_matrix) %>%
  tibble::rownames_to_column("Method1") %>%
  pivot_longer(-Method1, names_to = "Method2", values_to = "Agreement")

ggplot(agreement_df, aes(x = Method1, y = Method2, fill = Agreement)) +
  geom_tile() +
  geom_text(aes(label = round(Agreement, 1)), color = "white", size = 5) +
  scale_fill_gradient(low = "steelblue", high = "darkred") +
  labs(
    title = "Agreement Between Clustering Methods (%)",
    x = NULL,
    y = NULL,
    fill = "Agreement (%)"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_text(face = "bold"),
    panel.grid = element_blank()
  )

Examine Differences Between Methods

Let’s look at where the methods disagree:

# Compare the standard K-means method to others
for (method in c("Klog", "Kasin", "Kgmm")) {
  cat("\nDifferences between K and", method, ":\n")
  print(table(K = read_freq$K, Other = read_freq[[method]]))
}

## 
## Differences between K and Klog :
##      Other
## K     REF HET ALT
##   REF 100  17   0
##   HET   0  20   8
##   ALT   0   0   8
## 
## Differences between K and Kasin :
##      Other
## K     REF HET ALT
##   REF 100  17   0
##   HET   0  25   3
##   ALT   0   0   8
## 
## Differences between K and Kgmm :
##      Other
## K     REF HET ALT
##   REF  98  19   0
##   HET   0  24   4
##   ALT   0   0   8

# Create a visualization of where methods disagree
disagreement_data <- read_freq %>%
  mutate(
    POS = POS_BIN * BIN_SIZE,
    # Check if all methods agree
    agreement = ifelse(
      K == Klog & K == Kasin & K == Kgmm,
      "All methods agree",
      "Methods disagree"
    )
  )

ggplot(disagreement_data, aes(x = POS, y = 1, fill = agreement)) +
  geom_tile() +
  scale_fill_manual(values = c("darkred", "darkgreen")) +
  labs(
    title = paste("Regions of Agreement/Disagreement Between Methods (", BIN_SIZE_LABEL, " bins)", sep=""),
    x = "Position (Mb)",
    fill = NULL
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6))

Effect of Bin Size

The bin size used in this analysis is 1Mb. Adjusting the bin size can have several effects on the analysis:

Resolution: Smaller bins provide higher resolution but may have more noise due to fewer variants per bin. Larger bins provide smoother results but lower resolution.
Statistical power: Larger bins typically contain more variants, which can provide more robust allele frequency estimates.
Biological relevance: The appropriate bin size may depend on the biological question. For example, analyzing recombination might require smaller bins than analyzing large structural variants.

To explore different bin sizes, you can change the BIN_SIZE parameter at the top of this document and rerun the analysis.

Save Results

We can save the final results for further analysis:

# Extract relevant columns
results <- read_freq %>%
  select(POS_BIN, n, NCOUNT, ALT_COUNT, ALT_FREQ, ALT_FREQ_NORM, K)

# Save to file with bin size in the filename
output_file <- paste0("maize_bin_ancestry_", BIN_SIZE_LABEL, ".csv")
write.csv(results, output_file, row.names = FALSE)

cat("Results saved to:", output_file, "\n")

Conclusion

We implemented a three-class clustering approach for genomic ancestry assignment in bins of size 1Mb, with special handling for bins with no alternative alleles:

Bins with no alternative alleles (ALT_FREQ_NORM = 0) were automatically classified as “REF”
For the remaining bins (ALT_FREQ_NORM > 0), K-means clustering with k=3 was applied
The clusters were labeled “REF” (lowest ALT_FREQ_NORM), “HET” (intermediate), and “ALT” (highest)
We replaced any clustering-assigned “REF” with forced “REF” assignments for bins with zero ALT alleles

After comparing four clustering methods on the non-zero bins: 1. K-means on normalized ALT_FREQ 2. K-means on log10-transformed normalized ALT_FREQ 3. K-means on arcsin-transformed normalized ALT_FREQ 4. Gaussian mixture model on normalized ALT_FREQ

We found substantial agreement between the methods, with K-means on raw normalized ALT_FREQ being our preferred approach. This approach has several advantages:

It directly enforces the biological constraint that bins with no alternative alleles must be REF
It normalizes by variant count to account for coverage variations
It allows for the detection of three distinct genotype classes even within the non-zero ALT_FREQ bins

The three genotype clusters effectively represent: - REF: Regions with zero or very low normalized alternative allele frequency, suggesting reference homozygous genotype - HET: Regions with intermediate normalized alternative allele frequency, suggesting heterozygous genotype - ALT: Regions with high normalized alternative allele frequency, suggesting alternative homozygous genotype

This approach provides a biologically interpretable framework for analyzing genomic ancestry patterns in maize.

Session Information

# Record session information for reproducibility
sessionInfo()

## R version 4.4.2 (2024-10-31)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sequoia 15.1.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] tibble_3.2.1        ggpubr_0.6.0        scales_1.3.0       
## [4] tidyr_1.3.1         rebmix_2.16.0       Ckmeans.1d.dp_4.3.5
## [7] ggplot2_3.5.2       dplyr_1.1.4        
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10       generics_0.1.3    rstatix_0.7.2     lattice_0.22-6   
##  [5] digest_0.6.37     magrittr_2.0.3    evaluate_1.0.3    grid_4.4.2       
##  [9] fastmap_1.2.0     Matrix_1.7-1      jsonlite_2.0.0    backports_1.5.0  
## [13] gridExtra_2.3     mgcv_1.9-1        purrr_1.0.4       jquerylib_0.1.4  
## [17] abind_1.4-5       Rdpack_2.6.1      cli_3.6.4         rlang_1.1.6      
## [21] rbibutils_2.2.16  cowplot_1.1.3     munsell_0.5.1     splines_4.4.2    
## [25] withr_3.0.2       cachem_1.1.0      yaml_2.3.10       tools_4.4.2      
## [29] ggsignif_0.6.4    colorspace_2.1-1  broom_1.0.8       vctrs_0.6.5      
## [33] R6_2.6.1          lifecycle_1.0.4   car_3.1-2         pkgconfig_2.0.3  
## [37] pillar_1.10.2     bslib_0.9.0       gtable_0.3.6      glue_1.8.0       
## [41] Rcpp_1.0.14       xfun_0.52         tidyselect_1.2.1  rstudioapi_0.17.1
## [45] knitr_1.50        farver_2.1.2      htmltools_0.5.8.1 nlme_3.1-166     
## [49] labeling_0.4.3    rmarkdown_2.29    carData_3.0-5     compiler_4.4.2