This document outlines a genomic analysis performed on maize genetic data. The goal is to identify genomic ancestry across bins of size 1Mb using the following approach:
Four different clustering methods are compared: 1. K-means with normalized ALT_FREQ 2. K-means with arcsin transformation of normalized ALT_FREQ 3. K-means with log10 transformation of normalized ALT_FREQ 4. Gaussian mixture model using the REBMIX library
# Load required libraries
library(dplyr)        # Data manipulation
library(ggplot2)      # Plotting
library(Ckmeans.1d.dp) # K-means clustering
library(rebmix)       # Gaussian mixture model
library(tidyr)        # Data reshaping
library(scales)       # Unit formatting for plots
library(ggpubr)       # Plot arrangement
library(tibble)       # Modern data framesFirst, we read the binned allelic counts:
# Create bins of specified size and calculate summary statistics for each bin
read_freq <- read.table("~/DEsktop/PN17_SID1632_bin_genotypes.tsv",header=TRUE)
# Display first few rows of the binned data
head(read_freq)##   CONTIG POS_BIN     n NCOUNT ALT_COUNT    ALT_FREQ BIN_START BIN_END
## 1   chr1       1 10062   8884        36 0.004052229      3830  999990
## 2   chr1       2 16194  12748        28 0.002196423   1000069 1999892
## 3   chr1       3 18715  13955        32 0.002293085   2000252 2999995
## 4   chr1       4 15972  12956        24 0.001852424   3000032 3989703
## 5   chr1       5 18368  14794        42 0.002838989   4002810 4999974
## 6   chr1       6 16359  13049        29 0.002222393   5000001 5997031
##   ALT_FREQ_NORM GENOTYPE
## 1   0.002547013      REF
## 2   0.002221892      REF
## 3   0.002680790      REF
## 4   0.001848215      REF
## 5   0.003257455      REF
## 6   0.002271069      REF## Total number of bins: 2132## Average number of variants per bin: 16008.37## Median number of variants per bin: 15789## Range of variants per bin: 17 to 27631We’ll normalize the alternative allele frequency by the number of variants in each bin:
# Calculate normalization statistics
mean_n <- mean(read_freq$n)
sd_n <- sd(read_freq$n)
# Create normalized ALT_FREQ (accounting for variant density)
read_freq <- read_freq %>%
  mutate(
    # Normalize by variant count
    ALT_FREQ_NORM = ALT_FREQ * (n / mean_n),
    # Cap extreme values (optional)
    ALT_FREQ_NORM = pmin(pmax(ALT_FREQ_NORM, 0), 1)
  )
# Show the relationship between variant count and ALT_FREQ
ggplot(read_freq, aes(x = n, y = ALT_FREQ)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "loess", se = TRUE) +
  labs(title = paste("Relationship between variant count and ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""),
       x = "Number of variants in bin", 
       y = "Alternative allele frequency") +
  theme_minimal()# Compare original vs normalized frequencies
p1 <- ggplot(read_freq, aes(x = ALT_FREQ)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = "Original ALT_FREQ distribution", x = "ALT_FREQ") +
  theme_minimal()
p2 <- ggplot(read_freq, aes(x = ALT_FREQ_NORM)) +
  geom_histogram(bins = 30, fill = "tomato", color = "white") +
  labs(title = "Normalized ALT_FREQ_NORM distribution", x = "ALT_FREQ_NORM") +
  theme_minimal()
ggpubr::ggarrange(p1, p2, ncol = 2)We’ll now identify bins with no alternative alleles:
# Check how many bins have ALT_FREQ_NORM = 0
zero_alt_bins <- sum(read_freq$ALT_FREQ_NORM == 0)
total_bins <- nrow(read_freq)
cat("Bins with ALT_FREQ_NORM = 0:", 
    zero_alt_bins, "out of", total_bins, 
    "(", round(100 * zero_alt_bins / total_bins, 1), "%)\n")## Bins with ALT_FREQ_NORM = 0: 4 out of 2132 ( 0.2 %)# Create a separate dataset for non-zero bins (for clustering)
non_zero_bins <- read_freq %>%
  filter(ALT_FREQ_NORM > 0)
cat("Bins with ALT_FREQ_NORM > 0 (to be clustered):", 
    nrow(non_zero_bins), "out of", total_bins,
    "(", round(100 * nrow(non_zero_bins) / total_bins, 1), "%)\n")## Bins with ALT_FREQ_NORM > 0 (to be clustered): 2128 out of 2132 ( 99.8 %)We create transformed versions of the normalized ALT_FREQ data for non-zero bins:
# Calculate minimum possible normalized frequency (for log transformation)
min_read_freq_norm <-   1/(1/min(read_freq$ALT_FREQ_NORM[read_freq$ALT_FREQ_NORM>0])+1)
# Create transformations
non_zero_bins$log <- log10(non_zero_bins$ALT_FREQ_NORM + min_read_freq_norm) # Log transformation
non_zero_bins$asin <- asin(sqrt(non_zero_bins$ALT_FREQ_NORM))           # Arcsin transformation
# Visualize distributions of the normalized and transformed data
p1 <- ggplot(non_zero_bins, aes(x = ALT_FREQ_NORM)) + 
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = paste("Normalized ALT_FREQ_NORM distribution (non-zero bins, ", BIN_SIZE_LABEL, ")", sep=""), 
       x = "ALT_FREQ_NORM") +
  theme_minimal()
p2 <- ggplot(non_zero_bins, aes(x = asin)) + 
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = "Arcsin transformation (non-zero bins)",
       x = "asin") +
  theme_minimal()
p3 <- ggplot(non_zero_bins, aes(x = log)) + 
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = "Log10 transformation (non-zero bins)",
       x = "log") +
  theme_minimal()
ggpubr::ggarrange(p1, p2, p3, common.legend = TRUE)We apply K-means clustering to each version of the normalized data for the non-zero bins, using k=3:
K=3
# Apply K-means clustering with k=3 to each transformation for non-zero bins
non_zero_bins$K <- as.factor(Ckmeans.1d.dp(non_zero_bins$ALT_FREQ_NORM, K)$cluster)
non_zero_bins$Klog <- as.factor(Ckmeans.1d.dp(non_zero_bins$log, K)$cluster)
non_zero_bins$Kasin <- as.factor(Ckmeans.1d.dp(non_zero_bins$asin, K)$cluster)
# Function to relabel clusters as REF, HET, ALT based on ALT_FREQ_NORM
relabel_clusters <- function(clusters, data) {
  # Calculate mean ALT_FREQ_NORM for each cluster
  cluster_means <- tapply(data$ALT_FREQ_NORM, clusters, mean)
  
  # Order clusters by mean ALT_FREQ_NORM
  ordered_clusters <- order(cluster_means)
  
  # Create mapping from original cluster to REF, HET, ALT
  cluster_map <- rep(NA, length(unique(as.numeric(clusters))))
  cluster_map[ordered_clusters[1]] <- "REF"
  cluster_map[ordered_clusters[2]] <- "HET"
  cluster_map[ordered_clusters[3]] <- "ALT"
  
  # Apply mapping to original clusters
  return(factor(cluster_map[as.numeric(clusters)], levels = c("REF", "HET", "ALT")))
}
# Apply relabeling to each clustering method
non_zero_bins$K <- relabel_clusters(non_zero_bins$K, non_zero_bins)
non_zero_bins$Klog <- relabel_clusters(non_zero_bins$Klog, non_zero_bins)
non_zero_bins$Kasin <- relabel_clusters(non_zero_bins$Kasin, non_zero_bins)
# Check cluster proportions for each method
cat("K-means cluster proportions (non-zero bins):\n")## K-means cluster proportions (non-zero bins):## 
##   REF   HET   ALT 
## 0.969 0.017 0.014## 
## Log-transformed K-means cluster proportions (non-zero bins):## 
##   REF   HET   ALT 
## 0.338 0.630 0.032## 
## Arcsin-transformed K-means cluster proportions (non-zero bins):## 
##   REF   HET   ALT 
## 0.968 0.018 0.014As a comparison to K-means, we also apply a Gaussian mixture model using the REBMIX library:
# Fit Gaussian mixture model with 3 components
normalest <- REBMIX(
  Dataset = list(data.frame(Value = non_zero_bins$ALT_FREQ_NORM)),
  Preprocessing = "histogram",
  cmin = 3,  # Force 3 components
  cmax = 3,
  Criterion = "BIC",
  pdf = "normal"
)
# Assign clusters
normclu <- RCLRMIX(x = normalest)
non_zero_bins$Kgmm <- as.factor(normclu@Zp)
# Relabel GMM clusters
non_zero_bins$Kgmm <- relabel_clusters(non_zero_bins$Kgmm, non_zero_bins)
# Check GMM cluster proportions
cat("Gaussian Mixture Model cluster proportions (non-zero bins):\n")## Gaussian Mixture Model cluster proportions (non-zero bins):## 
##   REF   HET   ALT 
## 0.960 0.026 0.015Let’s examine the clustering results for the non-zero bins:
# Calculate mean ALT_FREQ_NORM for each cluster and method
cluster_summary <- non_zero_bins %>%
  pivot_longer(cols = c(K, Klog, Kasin, Kgmm), 
               names_to = "method", 
               values_to = "cluster") %>%
  group_by(method, cluster) %>%
  summarize(
    mean_alt_freq_norm = mean(ALT_FREQ_NORM),
    min_alt_freq_norm = min(ALT_FREQ_NORM),
    max_alt_freq_norm = max(ALT_FREQ_NORM),
    n_bins = n()
  ) %>%
  ungroup()
# Display summary
cluster_summary %>%
  mutate(
    method = case_when(
      method == "K" ~ "K-means (raw)",
      method == "Klog" ~ "K-means (log10)",
      method == "Kasin" ~ "K-means (arcsin)",
      method == "GMM" ~ "Gaussian Mixture",
      TRUE ~ method
    )
  ) %>%
  arrange(method, cluster) %>%
  knitr::kable(digits = 3)| method | cluster | mean_alt_freq_norm | min_alt_freq_norm | max_alt_freq_norm | n_bins | 
|---|---|---|---|---|---|
| K-means (arcsin) | REF | 0.003 | 0.000 | 0.016 | 2060 | 
| K-means (arcsin) | HET | 0.049 | 0.020 | 0.096 | 39 | 
| K-means (arcsin) | ALT | 0.170 | 0.117 | 0.301 | 29 | 
| K-means (log10) | REF | 0.002 | 0.000 | 0.002 | 719 | 
| K-means (log10) | HET | 0.003 | 0.002 | 0.015 | 1340 | 
| K-means (log10) | ALT | 0.100 | 0.016 | 0.301 | 69 | 
| K-means (raw) | REF | 0.003 | 0.000 | 0.025 | 2062 | 
| K-means (raw) | HET | 0.051 | 0.028 | 0.096 | 37 | 
| K-means (raw) | ALT | 0.170 | 0.117 | 0.301 | 29 | 
| Kgmm | REF | 0.002 | 0.000 | 0.006 | 2042 | 
| Kgmm | HET | 0.035 | 0.006 | 0.077 | 55 | 
| Kgmm | ALT | 0.165 | 0.091 | 0.301 | 31 | 
# Visualize the clustering results for non-zero bins
# Define a consistent color palette for REF, HET, ALT
pal <- c("gold", "springgreen4", "purple4")
names(pal) <- c("REF", "HET", "ALT")
# Create a function for histograms colored by cluster
create_hist <- function(data, method_col, title) {
  ggplot(data, aes(x = ALT_FREQ_NORM, fill = .data[[method_col]])) +
    geom_histogram(bins = 30, position = "identity", alpha = 0.6) +
    scale_fill_manual(values = pal) +
    labs(title = title, x = "ALT_FREQ_NORM", fill = "Cluster") +
    theme_minimal()
}
# Create histograms for each method
h1 <- create_hist(non_zero_bins, "K", "K-means on normalized ALT_FREQ")
h2 <- create_hist(non_zero_bins, "Klog", "K-means on log10 transformed ALT_FREQ")
h3 <- create_hist(non_zero_bins, "Kasin", "K-means on arcsin transformed ALT_FREQ")
h4 <- create_hist(non_zero_bins, "Kgmm", "Gaussian mixture model on normalized ALT_FREQ")
# Display all histograms
ggpubr::ggarrange(h1, h2, h3, h4, ncol = 2)## $`1`## 
## $`2`## 
## attr(,"class")
## [1] "list"      "ggarrange"Now we merge the clustering results back to the full dataset, ensuring that bins with ALT_FREQ_NORM = 0 are classified as “REF”:
# Create a new column in read_freq for each clustering method
read_freq$K <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Klog <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Kasin <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Kgmm <- factor(NA, levels = c("REF", "HET", "ALT"))
# First, assign the non-zero bin clustering results to the main dataset
for (method in c("K", "Klog", "Kasin", "Kgmm")) {
  # Create an index for non-zero bins in the original data
  non_zero_idx <- which(read_freq$ALT_FREQ_NORM > 0)
  
  # Assign cluster classifications to non-zero bins
  read_freq[[method]][non_zero_idx] <- non_zero_bins[[method]]
}
# Then, force bins with ALT_FREQ_NORM = 0 to be "REF"
zero_idx <- which(read_freq$ALT_FREQ_NORM == 0)
for (method in c("K", "Klog", "Kasin", "Kgmm")) {
  read_freq[[method]][zero_idx] <- "REF"
}
# The log transform hace nice normala data but high sensitivity to HET
# so I'decide to change thos hets to REF
read_freq$Klog[read_freq$Klog=="HET"] <- "REF"
# Verify the merging worked correctly
cat("Final cluster proportions for raw K-means:\n")## Final cluster proportions for raw K-means:## 
##   REF   HET   ALT 
## 0.969 0.017 0.014# Summarize mean ALT_FREQ_NORM for each genotype class across methods
genotype_summary <- read_freq %>%
  pivot_longer(cols = c(K, Klog, Kasin, Kgmm), 
               names_to = "method", 
               values_to = "genotype") %>%
  group_by(method, genotype) %>%
  summarize(
    mean_alt_freq_norm = mean(ALT_FREQ_NORM),
    mean_alt_freq = mean(ALT_FREQ),
    median_alt_freq_norm = median(ALT_FREQ_NORM),
    mean_n = mean(n),
    n_bins = n()
  ) %>%
  ungroup()
# Display summary
genotype_summary %>%
  mutate(
    method = case_when(
      method == "K" ~ "K-means (raw)",
      method == "Klog" ~ "K-means (log10)",
      method == "Kasin" ~ "K-means (arcsin)",
      method == "Kgmm" ~ "Gaussian Mixture",
      TRUE ~ method
    )
  ) %>%
  arrange(method, genotype) %>%
  knitr::kable(digits = 3)| method | genotype | mean_alt_freq_norm | mean_alt_freq | median_alt_freq_norm | mean_n | n_bins | 
|---|---|---|---|---|---|---|
| Gaussian Mixture | REF | 0.002 | 0.003 | 0.002 | 16033.32 | 2046 | 
| Gaussian Mixture | HET | 0.035 | 0.040 | 0.034 | 15385.67 | 55 | 
| Gaussian Mixture | ALT | 0.165 | 0.172 | 0.150 | 15466.26 | 31 | 
| K-means (arcsin) | REF | 0.003 | 0.003 | 0.002 | 16047.72 | 2064 | 
| K-means (arcsin) | HET | 0.049 | 0.057 | 0.045 | 14317.44 | 39 | 
| K-means (arcsin) | ALT | 0.170 | 0.177 | 0.156 | 15481.79 | 29 | 
| K-means (log10) | REF | 0.003 | 0.003 | 0.002 | 16045.70 | 2063 | 
| K-means (log10) | ALT | 0.100 | 0.107 | 0.073 | 14892.32 | 69 | 
| K-means (raw) | REF | 0.003 | 0.003 | 0.002 | 16044.19 | 2066 | 
| K-means (raw) | HET | 0.051 | 0.059 | 0.046 | 14421.11 | 37 | 
| K-means (raw) | ALT | 0.170 | 0.177 | 0.156 | 15481.79 | 29 | 
To compare the different clustering methods, we visualize the data in several ways:
# Create a function for scatter plots
create_scatter <- function(data, method_col, title) {
  data %>%
    mutate(POS = POS_BIN * BIN_SIZE) %>%
    ggplot(aes(x = POS, y = ALT_FREQ_NORM, col = .data[[method_col]])) +
        labs(title = title, x = "Position (bp)", y = "ALT_FREQ_NORM", color = "Genotype") +
    geom_point(size = 1, alpha = 0.7) +
    scale_color_manual(values = pal) +
    facet_wrap(.~CONTIG,ncol=10, scales = "free_x", strip.position = "bottom") +
    theme_bw() +
    scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6))
}
read_freq$CONTIG <-factor(read_freq$CONTIG, levels= paste0("chr",1:10))
  
# Create scatter plots for each method
p1 <- create_scatter(read_freq, "K", paste("K-means on normalized ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""))
p2 <- create_scatter(read_freq, "Klog", paste("K-means on log10 transformed ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""))
p3 <- create_scatter(read_freq, "Kasin", paste("K-means on arcsin transformed ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""))
p4 <- create_scatter(read_freq, "Kgmm", paste("Gaussian mixture model on normalized ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""))
# Display all scatter plots
ggpubr::ggarrange(p1, p2, p3, p4, ncol = 1, common.legend = TRUE)# Create a function for tile plots
create_tile <- function(data, method_col, title) {
  data %>%
    mutate(POS = POS_BIN * BIN_SIZE) %>%
    ggplot(aes(x = POS, y = 1, fill = .data[[method_col]])) +
    geom_tile() +
    facet_wrap( .~CONTIG, ncol = 1, strip.position = "left") +
    scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6))+
    scale_fill_manual(values = pal) +
    labs(title = title, x = "Position (bp)", fill = "Genotype") +
    ggpubr::theme_classic2() +
    theme(
      # legend.position = "top",
      strip.background = element_blank(),
      strip.text.y.left = element_text(angle = 0),
      panel.spacing.y=unit(0.0, "lines"),
      axis.line.y = element_blank(),
      axis.text.y = element_blank(),
      axis.title.y = element_blank(),
      axis.ticks.y = element_blank()
    ) 
}
# Create tile plots for PN17_SID1632
# create_tile(read_freq, "GENOTYPE", paste("ALT_FREQ (", BIN_SIZE_LABEL, " bins)"," Sample PN17_SID1632", sep=""))
create_tile(read_freq, "Klog", paste("ALT_FREQ (", BIN_SIZE_LABEL, " bins)"," Sample PN17_SID1632", sep=""))# Create a combined plot for all methods
read_freq %>%
  tidyr::pivot_longer(cols = starts_with("K"), names_to = "method", values_to = "K") %>%
  mutate(
    POS = POS_BIN * BIN_SIZE,
    method = factor(method, 
                   levels = c("K", "Klog", "Kasin", "Kgmm"),
                   labels = c("K-means (raw)", "K-means (log10)", "K-means (arcsin)", "Gaussian Mixture"))
  ) %>%
  ggplot(aes(x = POS, y = 1, fill = K)) +
  geom_tile() +
  scale_fill_manual(values = pal) +
  facet_grid(method ~ CONTIG, scales = "free") +
  scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  ggpubr::theme_classic2() +
  theme(
    legend.position = "bottom",
    strip.background = element_blank(),
    strip.text.x = element_text(hjust = 0),
    strip.text.y = element_text(angle = 0, face = "bold"),
    panel.spacing.x=unit(0.0, "lines"),
    panel.spacing.y=unit(0.0, "lines"),
    axis.line.y = element_blank(),
    axis.text.y = element_blank(),
    axis.line.x = element_blank(),
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  labs(
    title = paste("Comparison of Clustering Methods (", BIN_SIZE_LABEL, " bins)", sep=""),
    x = "Genome Position (Mb)",
    fill = "Genotype"
  )Let’s compare the genotype frequencies across all methods:
# Create a summary of genotype frequencies for all methods
genotype_freq <- read_freq %>%
  pivot_longer(cols = c(K, Klog, Kasin, Kgmm), 
               names_to = "method", 
               values_to = "genotype") %>%
  group_by(method, genotype) %>%
  summarize(
    count = n(),
    percentage = n() / nrow(read_freq) * 100
  ) %>%
  ungroup() %>%
  mutate(
    method = case_when(
      method == "K" ~ "K-means (raw)",
      method == "Klog" ~ "K-means (log10)",
      method == "Kasin" ~ "K-means (arcsin)",
      method == "Kgmm" ~ "Gaussian Mixture",
      TRUE ~ method
    )
  )
# Display frequency table
genotype_freq %>%
  pivot_wider(
    id_cols = genotype,
    names_from = method,
    values_from = percentage
  ) %>%
  knitr::kable(
    caption = paste("Genotype frequencies (%) across methods (", BIN_SIZE_LABEL, " bins)", sep=""),
    digits = 1
  )| genotype | K-means (raw) | K-means (arcsin) | Gaussian Mixture | K-means (log10) | 
|---|---|---|---|---|
| REF | 96.9 | 96.8 | 96.0 | 96.8 | 
| HET | 1.7 | 1.8 | 2.6 | NA | 
| ALT | 1.4 | 1.4 | 1.5 | 3.2 | 
# Create a bar plot to visualize frequencies
ggplot(genotype_freq, aes(x = method, y = percentage, fill = genotype)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_manual(values = pal) +
  labs(
    title = paste("Genotype frequencies across methods (", BIN_SIZE_LABEL, " bins)", sep=""),
    x = "Method",
    y = "Percentage (%)",
    fill = "Genotype"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top"
  )Let’s analyze how much the different clustering methods agree with each other:
# Create a function to calculate agreement between methods
agreement_matrix <- matrix(NA, 4, 4)
methods <- c("K", "Klog", "Kasin", "Kgmm")
colnames(agreement_matrix) <- methods
rownames(agreement_matrix) <- methods
for (i in 1:4) {
  for (j in 1:4) {
    agreement_matrix[i, j] <- mean(read_freq[[methods[i]]] == read_freq[[methods[j]]]) * 100
  }
}
# Display agreement matrix
knitr::kable(agreement_matrix, caption = "Percentage agreement between methods", digits = 1)| K | Klog | Kasin | Kgmm | |
|---|---|---|---|---|
| K | 100.0 | 98.1 | 99.9 | 99.0 | 
| Klog | 98.1 | 100.0 | 98.1 | 97.4 | 
| Kasin | 99.9 | 98.1 | 100.0 | 99.1 | 
| Kgmm | 99.0 | 97.4 | 99.1 | 100.0 | 
# Visualize agreement matrix
agreement_df <- as.data.frame(agreement_matrix) %>%
  tibble::rownames_to_column("Method1") %>%
  pivot_longer(-Method1, names_to = "Method2", values_to = "Agreement")
ggplot(agreement_df, aes(x = Method1, y = Method2, fill = Agreement)) +
  geom_tile() +
  geom_text(aes(label = round(Agreement, 1)), color = "white", size = 5) +
  scale_fill_gradient(low = "steelblue", high = "darkred") +
  labs(
    title = "Agreement Between Clustering Methods (%)",
    x = NULL,
    y = NULL,
    fill = "Agreement (%)"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_text(face = "bold"),
    panel.grid = element_blank()
  )Let’s look at where the methods disagree:
# Create a visualization of where methods disagree, across all chromosomes
disagreement_data <- read_freq %>%
  mutate(
    POS = POS_BIN * BIN_SIZE,
    # Check if all methods agree
    agreement = factor(ifelse(
      K == Klog & K == Kasin & K == Kgmm,
      "All methods agree",
      "Methods disagree"
    ), levels = c("All methods agree", "Methods disagree"))
  )
# Create tile plot with chromosomes on facets
ggplot(disagreement_data, aes(x = POS, y = 1, fill = agreement)) +
  geom_tile() +
  scale_fill_manual(values = c("darkgreen", "darkred")) +
  facet_grid(. ~ CONTIG, scales = "free_x") +
  scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  ggpubr::theme_classic2() +
  theme(
    legend.position = "bottom",
    strip.background = element_blank(),
    strip.text.x = element_text(hjust = 0),
    strip.text.y = element_text(angle = 0, face = "bold"),
    panel.spacing.x=unit(0.0, "lines"),
    panel.spacing.y=unit(0.0, "lines"),
    axis.line.y = element_blank(),
    axis.text.y = element_blank(),
    axis.line.x = element_blank(),
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  labs(
    title = paste("Regions of Agreement/Disagreement Between Methods (", BIN_SIZE_LABEL, " bins)", sep=""),
    x = "Genome Position (Mb)",
    fill = NULL
  )The bin size used in this analysis is 1Mb. Adjusting the bin size can have several effects on the analysis:
Resolution: Smaller bins provide higher resolution but may have more noise due to fewer variants per bin. Larger bins provide smoother results but lower resolution.
Statistical power: Larger bins typically contain more variants, which can provide more robust allele frequency estimates.
Biological relevance: The appropriate bin size may depend on the biological question. For example, analyzing recombination might require smaller bins than analyzing large structural variants.
To explore different bin sizes, you can change the
BIN_SIZE parameter at the top of this document and rerun
the analysis.
We can save the final results for further analysis:
# Extract relevant columns
results <- read_freq %>%
  select(POS_BIN, n, NCOUNT, ALT_COUNT, ALT_FREQ, ALT_FREQ_NORM, K)
# Save to file with bin size in the filename
output_file <- paste0("maize_bin_ancestry_", BIN_SIZE_LABEL, ".csv")
write.csv(results, output_file, row.names = FALSE)
cat("Results saved to:", output_file, "\n")We implemented a three-class clustering approach for genomic ancestry assignment in bins of size 1Mb, with special handling for bins with no alternative alleles:
After comparing four clustering methods on the non-zero bins: 1. K-means on normalized ALT_FREQ 2. K-means on log10-transformed normalized ALT_FREQ 3. K-means on arcsin-transformed normalized ALT_FREQ 4. Gaussian mixture model on normalized ALT_FREQ
We found substantial agreement between the methods, with K-means on raw normalized ALT_FREQ being our preferred approach. This approach has several advantages:
The three genotype clusters effectively represent: - REF: Regions with zero or very low normalized alternative allele frequency, suggesting reference homozygous genotype - HET: Regions with intermediate normalized alternative allele frequency, suggesting heterozygous genotype - ALT: Regions with high normalized alternative allele frequency, suggesting alternative homozygous genotype
This approach provides a biologically interpretable framework for analyzing genomic ancestry patterns in maize.
## R version 4.4.2 (2024-10-31)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sequoia 15.1.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] tibble_3.2.1        ggpubr_0.6.0        scales_1.3.0       
## [4] tidyr_1.3.1         rebmix_2.16.0       Ckmeans.1d.dp_4.3.5
## [7] ggplot2_3.5.2       dplyr_1.1.4        
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10       generics_0.1.3    rstatix_0.7.2     lattice_0.22-6   
##  [5] digest_0.6.37     magrittr_2.0.3    evaluate_1.0.3    grid_4.4.2       
##  [9] fastmap_1.2.0     Matrix_1.7-1      jsonlite_2.0.0    backports_1.5.0  
## [13] gridExtra_2.3     mgcv_1.9-1        purrr_1.0.4       jquerylib_0.1.4  
## [17] abind_1.4-5       Rdpack_2.6.1      cli_3.6.4         rlang_1.1.6      
## [21] rbibutils_2.2.16  cowplot_1.1.3     munsell_0.5.1     splines_4.4.2    
## [25] withr_3.0.2       cachem_1.1.0      yaml_2.3.10       tools_4.4.2      
## [29] ggsignif_0.6.4    colorspace_2.1-1  broom_1.0.8       vctrs_0.6.5      
## [33] R6_2.6.1          lifecycle_1.0.4   car_3.1-2         pkgconfig_2.0.3  
## [37] pillar_1.10.2     bslib_0.9.0       gtable_0.3.6      glue_1.8.0       
## [41] Rcpp_1.0.14       xfun_0.52         tidyselect_1.2.1  rstudioapi_0.17.1
## [45] knitr_1.50        farver_2.1.2      htmltools_0.5.8.1 nlme_3.1-166     
## [49] labeling_0.4.3    rmarkdown_2.29    carData_3.0-5     compiler_4.4.2