You can adjust the bin size by changing the parameter below:
# Set the bin size in base pairs (default: 100,000 = 100kb)
BIN_SIZE <- 1000000 # Change this value to adjust bin size
# Bin size in user-friendly format for plot labels
if (BIN_SIZE >= 1000000) {
BIN_SIZE_LABEL <- paste0(BIN_SIZE/1000000, "Mb")
} else if (BIN_SIZE >= 1000) {
BIN_SIZE_LABEL <- paste0(BIN_SIZE/1000, "kb")
} else {
BIN_SIZE_LABEL <- paste0(BIN_SIZE, "bp")
}
This document outlines a genomic analysis performed on maize genetic data. The goal is to identify genomic ancestry across bins of size 1Mb using the following approach:
Four different clustering methods are compared: 1. K-means with normalized ALT_FREQ 2. K-means with arcsin transformation of normalized ALT_FREQ 3. K-means with log10 transformation of normalized ALT_FREQ 4. Gaussian mixture model using the REBMIX library
# Load required libraries
library(dplyr) # Data manipulation
library(ggplot2) # Plotting
library(Ckmeans.1d.dp) # K-means clustering
library(rebmix) # Gaussian mixture model
library(tidyr) # Data reshaping
library(scales) # Unit formatting for plots
library(ggpubr) # Plot arrangement
library(tibble) # Modern data frames
First, we read the allelic counts data and create bins of the specified size:
# Read the allelic counts data
read_count <- read.table("~/Desktop/PN17_SID1632.allelicCounts.tsv",
comment.char = "@", header = TRUE)
# Display dimensions of the input data
nrow(read_count)
## [1] 77459
## ALT_COUNT
## 0 1 2 3 4
## 77267 162 22 6 2
# Create bins of specified size and calculate summary statistics for each bin
read_freq <- read_count %>%
# Create position bins by dividing positions into bins of size BIN_SIZE
mutate(POS_BIN = ceiling(POSITION/BIN_SIZE) %>% as.integer()) %>%
# Group by these bins
group_by(POS_BIN) %>%
# Calculate statistics for each bin
summarise(
n = n(), # Number of variants in bin
NCOUNT = (sum(ALT_COUNT) + sum(REF_COUNT)), # Total read count
ALT_COUNT = sum(ALT_COUNT), # Sum of alternative allele counts
ALT_FREQ = sum(ALT_COUNT)/NCOUNT # Alternative allele frequency
)
# Display first few rows of the binned data
head(read_freq)
## # A tibble: 6 × 5
## POS_BIN n NCOUNT ALT_COUNT ALT_FREQ
## <int> <int> <int> <int> <dbl>
## 1 1 609 523 3 0.00574
## 2 2 753 640 3 0.00469
## 3 3 425 377 2 0.00531
## 4 4 279 218 3 0.0138
## 5 5 516 380 0 0
## 6 6 481 440 0 0
## Total number of bins: 153
## Average number of variants per bin: 506.268
## Median number of variants per bin: 471
## Range of variants per bin: 139 to 1245
We’ll normalize the alternative allele frequency by the number of variants in each bin:
# Calculate normalization statistics
mean_n <- mean(read_freq$n)
sd_n <- sd(read_freq$n)
# Create normalized ALT_FREQ (accounting for variant density)
read_freq <- read_freq %>%
mutate(
# Normalize by variant count
ALT_FREQ_NORM = ALT_FREQ * (n / mean_n),
# Cap extreme values (optional)
ALT_FREQ_NORM = pmin(pmax(ALT_FREQ_NORM, 0), 1)
)
# Show the relationship between variant count and ALT_FREQ
ggplot(read_freq, aes(x = n, y = ALT_FREQ)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "loess", se = TRUE) +
labs(title = paste("Relationship between variant count and ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""),
x = "Number of variants in bin",
y = "Alternative allele frequency") +
theme_minimal()
# Compare original vs normalized frequencies
p1 <- ggplot(read_freq, aes(x = ALT_FREQ)) +
geom_histogram(bins = 30, fill = "steelblue", color = "white") +
labs(title = "Original ALT_FREQ distribution", x = "ALT_FREQ") +
theme_minimal()
p2 <- ggplot(read_freq, aes(x = ALT_FREQ_NORM)) +
geom_histogram(bins = 30, fill = "tomato", color = "white") +
labs(title = "Normalized ALT_FREQ_NORM distribution", x = "ALT_FREQ_NORM") +
theme_minimal()
ggpubr::ggarrange(p1, p2, ncol = 2)
We’ll now identify bins with no alternative alleles:
# Check how many bins have ALT_FREQ_NORM = 0
zero_alt_bins <- sum(read_freq$ALT_FREQ_NORM == 0)
total_bins <- nrow(read_freq)
cat("Bins with ALT_FREQ_NORM = 0:",
zero_alt_bins, "out of", total_bins,
"(", round(100 * zero_alt_bins / total_bins, 1), "%)\n")
## Bins with ALT_FREQ_NORM = 0: 51 out of 153 ( 33.3 %)
# Create a separate dataset for non-zero bins (for clustering)
non_zero_bins <- read_freq %>%
filter(ALT_FREQ_NORM > 0)
cat("Bins with ALT_FREQ_NORM > 0 (to be clustered):",
nrow(non_zero_bins), "out of", total_bins,
"(", round(100 * nrow(non_zero_bins) / total_bins, 1), "%)\n")
## Bins with ALT_FREQ_NORM > 0 (to be clustered): 102 out of 153 ( 66.7 %)
We create transformed versions of the normalized ALT_FREQ data for non-zero bins:
# Calculate minimum possible normalized frequency (for log transformation)
min_read_freq_norm <- 1/(1/min(read_freq$ALT_FREQ_NORM[read_freq$ALT_FREQ_NORM>0])+1)
# Create transformations
non_zero_bins$log <- log10(non_zero_bins$ALT_FREQ_NORM + min_read_freq_norm) # Log transformation
non_zero_bins$asin <- asin(sqrt(non_zero_bins$ALT_FREQ_NORM)) # Arcsin transformation
# Visualize distributions of the normalized and transformed data
p1 <- ggplot(non_zero_bins, aes(x = ALT_FREQ_NORM)) +
geom_histogram(bins = 30, fill = "steelblue", color = "white") +
labs(title = paste("Normalized ALT_FREQ_NORM distribution (non-zero bins, ", BIN_SIZE_LABEL, ")", sep=""),
x = "ALT_FREQ_NORM") +
theme_minimal()
p2 <- ggplot(non_zero_bins, aes(x = asin)) +
geom_histogram(bins = 30, fill = "steelblue", color = "white") +
labs(title = "Arcsin transformation (non-zero bins)",
x = "asin") +
theme_minimal()
p3 <- ggplot(non_zero_bins, aes(x = log)) +
geom_histogram(bins = 30, fill = "steelblue", color = "white") +
labs(title = "Log10 transformation (non-zero bins)",
x = "log") +
theme_minimal()
ggpubr::ggarrange(p1, p2, p3, common.legend = TRUE)
We apply K-means clustering to each version of the normalized data for the non-zero bins, using k=3:
# Apply K-means clustering with k=3 to each transformation for non-zero bins
non_zero_bins$K <- as.factor(Ckmeans.1d.dp(non_zero_bins$ALT_FREQ_NORM, 3)$cluster)
non_zero_bins$Klog <- as.factor(Ckmeans.1d.dp(non_zero_bins$log, 3)$cluster)
non_zero_bins$Kasin <- as.factor(Ckmeans.1d.dp(non_zero_bins$asin, 3)$cluster)
# Function to relabel clusters as REF, HET, ALT based on ALT_FREQ_NORM
relabel_clusters <- function(clusters, data) {
# Calculate mean ALT_FREQ_NORM for each cluster
cluster_means <- tapply(data$ALT_FREQ_NORM, clusters, mean)
# Order clusters by mean ALT_FREQ_NORM
ordered_clusters <- order(cluster_means)
# Create mapping from original cluster to REF, HET, ALT
cluster_map <- rep(NA, length(unique(as.numeric(clusters))))
cluster_map[ordered_clusters[1]] <- "REF"
cluster_map[ordered_clusters[2]] <- "HET"
cluster_map[ordered_clusters[3]] <- "ALT"
# Apply mapping to original clusters
return(factor(cluster_map[as.numeric(clusters)], levels = c("REF", "HET", "ALT")))
}
# Apply relabeling to each clustering method
non_zero_bins$K <- relabel_clusters(non_zero_bins$K, non_zero_bins)
non_zero_bins$Klog <- relabel_clusters(non_zero_bins$Klog, non_zero_bins)
non_zero_bins$Kasin <- relabel_clusters(non_zero_bins$Kasin, non_zero_bins)
# Check cluster proportions for each method
cat("K-means cluster proportions (non-zero bins):\n")
## K-means cluster proportions (non-zero bins):
##
## REF HET ALT
## 0.647 0.275 0.078
##
## Log-transformed K-means cluster proportions (non-zero bins):
##
## REF HET ALT
## 0.480 0.363 0.157
##
## Arcsin-transformed K-means cluster proportions (non-zero bins):
##
## REF HET ALT
## 0.480 0.412 0.108
As a comparison to K-means, we also apply a Gaussian mixture model using the REBMIX library:
# Fit Gaussian mixture model with 3 components
normalest <- REBMIX(
Dataset = list(data.frame(Value = non_zero_bins$ALT_FREQ_NORM)),
Preprocessing = "histogram",
cmin = 3, # Force 3 components
cmax = 3,
Criterion = "BIC",
pdf = "normal"
)
# Assign clusters
normclu <- RCLRMIX(x = normalest)
non_zero_bins$Kgmm <- as.factor(normclu@Zp)
# Relabel GMM clusters
non_zero_bins$Kgmm <- relabel_clusters(non_zero_bins$Kgmm, non_zero_bins)
# Check GMM cluster proportions
cat("Gaussian Mixture Model cluster proportions (non-zero bins):\n")
## Gaussian Mixture Model cluster proportions (non-zero bins):
##
## REF HET ALT
## 0.461 0.422 0.118
Let’s examine the clustering results for the non-zero bins:
# Calculate mean ALT_FREQ_NORM for each cluster and method
cluster_summary <- non_zero_bins %>%
pivot_longer(cols = c(K, Klog, Kasin, Kgmm),
names_to = "method",
values_to = "cluster") %>%
group_by(method, cluster) %>%
summarize(
mean_alt_freq_norm = mean(ALT_FREQ_NORM),
min_alt_freq_norm = min(ALT_FREQ_NORM),
max_alt_freq_norm = max(ALT_FREQ_NORM),
n_bins = n()
) %>%
ungroup()
# Display summary
cluster_summary %>%
mutate(
method = case_when(
method == "K" ~ "K-means (raw)",
method == "Klog" ~ "K-means (log10)",
method == "Kasin" ~ "K-means (arcsin)",
method == "Kgmm" ~ "Gaussian Mixture",
TRUE ~ method
)
) %>%
arrange(method, cluster) %>%
knitr::kable(digits = 3)
method | cluster | mean_alt_freq_norm | min_alt_freq_norm | max_alt_freq_norm | n_bins |
---|---|---|---|---|---|
Gaussian Mixture | REF | 0.002 | 0.001 | 0.003 | 47 |
Gaussian Mixture | HET | 0.006 | 0.004 | 0.009 | 43 |
Gaussian Mixture | ALT | 0.014 | 0.009 | 0.023 | 12 |
K-means (arcsin) | REF | 0.002 | 0.001 | 0.004 | 49 |
K-means (arcsin) | HET | 0.006 | 0.004 | 0.009 | 42 |
K-means (arcsin) | ALT | 0.014 | 0.010 | 0.023 | 11 |
K-means (log10) | REF | 0.002 | 0.001 | 0.004 | 49 |
K-means (log10) | HET | 0.006 | 0.004 | 0.008 | 37 |
K-means (log10) | ALT | 0.013 | 0.009 | 0.023 | 16 |
K-means (raw) | REF | 0.003 | 0.001 | 0.005 | 66 |
K-means (raw) | HET | 0.007 | 0.005 | 0.011 | 28 |
K-means (raw) | ALT | 0.016 | 0.013 | 0.023 | 8 |
# Visualize the clustering results for non-zero bins
# Define a consistent color palette for REF, HET, ALT
pal <- c("gold", "springgreen4", "purple4")
names(pal) <- c("REF", "HET", "ALT")
# Create a function for histograms colored by cluster
create_hist <- function(data, method_col, title) {
ggplot(data, aes(x = ALT_FREQ_NORM, fill = .data[[method_col]])) +
geom_histogram(bins = 30, position = "identity", alpha = 0.6) +
scale_fill_manual(values = pal) +
labs(title = title, x = "ALT_FREQ_NORM", fill = "Cluster") +
theme_minimal()
}
# Create histograms for each method
h1 <- create_hist(non_zero_bins, "K", "K-means on normalized ALT_FREQ")
h2 <- create_hist(non_zero_bins, "Klog", "K-means on log10 transformed ALT_FREQ")
h3 <- create_hist(non_zero_bins, "Kasin", "K-means on arcsin transformed ALT_FREQ")
h4 <- create_hist(non_zero_bins, "Kgmm", "Gaussian mixture model on normalized ALT_FREQ")
# Display all histograms
ggpubr::ggarrange(h1, h2, h3, h4, ncol = 2)
## $`1`
##
## $`2`
##
## attr(,"class")
## [1] "list" "ggarrange"
Now we merge the clustering results back to the full dataset, ensuring that bins with ALT_FREQ_NORM = 0 are classified as “REF”:
# Create a new column in read_freq for each clustering method
read_freq$K <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Klog <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Kasin <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Kgmm <- factor(NA, levels = c("REF", "HET", "ALT"))
# First, assign the non-zero bin clustering results to the main dataset
for (method in c("K", "Klog", "Kasin", "Kgmm")) {
# Create an index for non-zero bins in the original data
non_zero_idx <- which(read_freq$ALT_FREQ_NORM > 0)
# Assign cluster classifications to non-zero bins
read_freq[[method]][non_zero_idx] <- non_zero_bins[[method]]
}
# Then, force bins with ALT_FREQ_NORM = 0 to be "REF"
zero_idx <- which(read_freq$ALT_FREQ_NORM == 0)
for (method in c("K", "Klog", "Kasin", "Kgmm")) {
read_freq[[method]][zero_idx] <- "REF"
}
# Verify the merging worked correctly
cat("Final cluster proportions for raw K-means:\n")
## Final cluster proportions for raw K-means:
##
## REF HET ALT
## 0.765 0.183 0.052
# Summarize mean ALT_FREQ_NORM for each genotype class across methods
genotype_summary <- read_freq %>%
pivot_longer(cols = c(K, Klog, Kasin, Kgmm),
names_to = "method",
values_to = "genotype") %>%
group_by(method, genotype) %>%
summarize(
mean_alt_freq_norm = mean(ALT_FREQ_NORM),
mean_alt_freq = mean(ALT_FREQ),
median_alt_freq_norm = median(ALT_FREQ_NORM),
mean_n = mean(n),
n_bins = n()
) %>%
ungroup()
# Display summary
genotype_summary %>%
mutate(
method = case_when(
method == "K" ~ "K-means (raw)",
method == "Klog" ~ "K-means (log10)",
method == "Kasin" ~ "K-means (arcsin)",
method == "Kgmm" ~ "Gaussian Mixture",
TRUE ~ method
)
) %>%
arrange(method, genotype) %>%
knitr::kable(digits = 3)
method | genotype | mean_alt_freq_norm | mean_alt_freq | median_alt_freq_norm | mean_n | n_bins |
---|---|---|---|---|---|---|
Gaussian Mixture | REF | 0.001 | 0.001 | 0.000 | 457.480 | 98 |
Gaussian Mixture | HET | 0.006 | 0.005 | 0.005 | 587.372 | 43 |
Gaussian Mixture | ALT | 0.014 | 0.013 | 0.014 | 614.083 | 12 |
K-means (arcsin) | REF | 0.001 | 0.001 | 0.000 | 455.830 | 100 |
K-means (arcsin) | HET | 0.006 | 0.005 | 0.005 | 601.024 | 42 |
K-means (arcsin) | ALT | 0.014 | 0.014 | 0.015 | 603.000 | 11 |
K-means (log10) | REF | 0.001 | 0.001 | 0.000 | 455.830 | 100 |
K-means (log10) | HET | 0.006 | 0.005 | 0.005 | 585.297 | 37 |
K-means (log10) | ALT | 0.013 | 0.012 | 0.012 | 638.750 | 16 |
K-means (raw) | REF | 0.002 | 0.002 | 0.002 | 469.632 | 117 |
K-means (raw) | HET | 0.007 | 0.006 | 0.007 | 647.036 | 28 |
K-means (raw) | ALT | 0.016 | 0.016 | 0.015 | 549.375 | 8 |
To compare the different clustering methods, we visualize the data in several ways:
# Create a function for scatter plots
create_scatter <- function(data, method_col, title) {
data %>%
mutate(POS = POS_BIN * BIN_SIZE) %>%
ggplot(aes(x = POS, y = ALT_FREQ_NORM, col = .data[[method_col]])) +
geom_point(size = 1, alpha = 0.7) +
scale_color_manual(values = pal) +
labs(title = title, x = "Position (bp)", y = "ALT_FREQ_NORM", color = "Genotype") +
theme_bw() +
scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6))
}
# Create scatter plots for each method
p1 <- create_scatter(read_freq, "K", paste("K-means on normalized ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""))
p2 <- create_scatter(read_freq, "Klog", paste("K-means on log10 transformed ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""))
p3 <- create_scatter(read_freq, "Kasin", paste("K-means on arcsin transformed ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""))
p4 <- create_scatter(read_freq, "Kgmm", paste("Gaussian mixture model on normalized ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""))
# Display all scatter plots
ggpubr::ggarrange(p1, p2, p3, p4, ncol = 1, common.legend = TRUE)
# Create a combined plot for all methods
read_freq %>%
tidyr::pivot_longer(cols = starts_with("K"), names_to = "method", values_to = "K") %>%
mutate(
POS = POS_BIN * BIN_SIZE,
method = factor(method,
levels = c("K", "Klog", "Kasin", "Kgmm"),
labels = c("K-means (raw)", "K-means (log10)", "K-means (arcsin)", "Gaussian Mixture"))
) %>%
ggplot(aes(x = POS, y = 1, fill = K)) +
geom_tile() +
scale_fill_manual(values = pal) +
facet_wrap(. ~ method, ncol = 1, strip.position = "left") +
scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
ggpubr::theme_classic2() +
theme(
strip.background = element_blank(),
strip.text = element_text(size = 10, face = "bold"),
axis.line.y = element_blank(),
axis.text.y = element_blank(),
axis.title.y = element_blank(),
axis.ticks.y = element_blank()
) +
labs(
title = paste("Comparison of Clustering Methods (", BIN_SIZE_LABEL, " bins)", sep=""),
x = "Genome Position (Mb)",
fill = "Genotype"
)
Let’s compare the genotype frequencies across all methods:
# Create a summary of genotype frequencies for all methods
genotype_freq <- read_freq %>%
pivot_longer(cols = c(K, Klog, Kasin, Kgmm),
names_to = "method",
values_to = "genotype") %>%
group_by(method, genotype) %>%
summarize(
count = n(),
percentage = n() / nrow(read_freq) * 100
) %>%
ungroup() %>%
mutate(
method = case_when(
method == "K" ~ "K-means (raw)",
method == "Klog" ~ "K-means (log10)",
method == "Kasin" ~ "K-means (arcsin)",
method == "Kgmm" ~ "Gaussian Mixture",
TRUE ~ method
)
)
# Display frequency table
genotype_freq %>%
pivot_wider(
id_cols = genotype,
names_from = method,
values_from = percentage
) %>%
knitr::kable(
caption = paste("Genotype frequencies (%) across methods (", BIN_SIZE_LABEL, " bins)", sep=""),
digits = 1
)
genotype | K-means (raw) | K-means (arcsin) | Gaussian Mixture | K-means (log10) |
---|---|---|---|---|
REF | 76.5 | 65.4 | 64.1 | 65.4 |
HET | 18.3 | 27.5 | 28.1 | 24.2 |
ALT | 5.2 | 7.2 | 7.8 | 10.5 |
# Create a bar plot to visualize frequencies
ggplot(genotype_freq, aes(x = method, y = percentage, fill = genotype)) +
geom_bar(stat = "identity", position = "stack") +
scale_fill_manual(values = pal) +
labs(
title = paste("Genotype frequencies across methods (", BIN_SIZE_LABEL, " bins)", sep=""),
x = "Method",
y = "Percentage (%)",
fill = "Genotype"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "top"
)
Let’s analyze how much the different clustering methods agree with each other:
# Create a function to calculate agreement between methods
agreement_matrix <- matrix(NA, 4, 4)
methods <- c("K", "Klog", "Kasin", "Kgmm")
colnames(agreement_matrix) <- methods
rownames(agreement_matrix) <- methods
for (i in 1:4) {
for (j in 1:4) {
agreement_matrix[i, j] <- mean(read_freq[[methods[i]]] == read_freq[[methods[j]]]) * 100
}
}
# Display agreement matrix
knitr::kable(agreement_matrix, caption = "Percentage agreement between methods", digits = 1)
K | Klog | Kasin | Kgmm | |
---|---|---|---|---|
K | 100.0 | 83.7 | 86.9 | 85.0 |
Klog | 83.7 | 100.0 | 96.7 | 96.1 |
Kasin | 86.9 | 96.7 | 100.0 | 98.0 |
Kgmm | 85.0 | 96.1 | 98.0 | 100.0 |
# Visualize agreement matrix
agreement_df <- as.data.frame(agreement_matrix) %>%
tibble::rownames_to_column("Method1") %>%
pivot_longer(-Method1, names_to = "Method2", values_to = "Agreement")
ggplot(agreement_df, aes(x = Method1, y = Method2, fill = Agreement)) +
geom_tile() +
geom_text(aes(label = round(Agreement, 1)), color = "white", size = 5) +
scale_fill_gradient(low = "steelblue", high = "darkred") +
labs(
title = "Agreement Between Clustering Methods (%)",
x = NULL,
y = NULL,
fill = "Agreement (%)"
) +
theme_minimal() +
theme(
axis.text = element_text(face = "bold"),
panel.grid = element_blank()
)
Let’s look at where the methods disagree:
# Compare the standard K-means method to others
for (method in c("Klog", "Kasin", "Kgmm")) {
cat("\nDifferences between K and", method, ":\n")
print(table(K = read_freq$K, Other = read_freq[[method]]))
}
##
## Differences between K and Klog :
## Other
## K REF HET ALT
## REF 100 17 0
## HET 0 20 8
## ALT 0 0 8
##
## Differences between K and Kasin :
## Other
## K REF HET ALT
## REF 100 17 0
## HET 0 25 3
## ALT 0 0 8
##
## Differences between K and Kgmm :
## Other
## K REF HET ALT
## REF 98 19 0
## HET 0 24 4
## ALT 0 0 8
# Create a visualization of where methods disagree
disagreement_data <- read_freq %>%
mutate(
POS = POS_BIN * BIN_SIZE,
# Check if all methods agree
agreement = ifelse(
K == Klog & K == Kasin & K == Kgmm,
"All methods agree",
"Methods disagree"
)
)
ggplot(disagreement_data, aes(x = POS, y = 1, fill = agreement)) +
geom_tile() +
scale_fill_manual(values = c("darkred", "darkgreen")) +
labs(
title = paste("Regions of Agreement/Disagreement Between Methods (", BIN_SIZE_LABEL, " bins)", sep=""),
x = "Position (Mb)",
fill = NULL
) +
theme_minimal() +
theme(
axis.text.y = element_blank(),
axis.title.y = element_blank(),
axis.ticks.y = element_blank()
) +
scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6))
The bin size used in this analysis is 1Mb. Adjusting the bin size can have several effects on the analysis:
Resolution: Smaller bins provide higher resolution but may have more noise due to fewer variants per bin. Larger bins provide smoother results but lower resolution.
Statistical power: Larger bins typically contain more variants, which can provide more robust allele frequency estimates.
Biological relevance: The appropriate bin size may depend on the biological question. For example, analyzing recombination might require smaller bins than analyzing large structural variants.
To explore different bin sizes, you can change the
BIN_SIZE
parameter at the top of this document and rerun
the analysis.
We can save the final results for further analysis:
# Extract relevant columns
results <- read_freq %>%
select(POS_BIN, n, NCOUNT, ALT_COUNT, ALT_FREQ, ALT_FREQ_NORM, K)
# Save to file with bin size in the filename
output_file <- paste0("maize_bin_ancestry_", BIN_SIZE_LABEL, ".csv")
write.csv(results, output_file, row.names = FALSE)
cat("Results saved to:", output_file, "\n")
We implemented a three-class clustering approach for genomic ancestry assignment in bins of size 1Mb, with special handling for bins with no alternative alleles:
After comparing four clustering methods on the non-zero bins: 1. K-means on normalized ALT_FREQ 2. K-means on log10-transformed normalized ALT_FREQ 3. K-means on arcsin-transformed normalized ALT_FREQ 4. Gaussian mixture model on normalized ALT_FREQ
We found substantial agreement between the methods, with K-means on raw normalized ALT_FREQ being our preferred approach. This approach has several advantages:
The three genotype clusters effectively represent: - REF: Regions with zero or very low normalized alternative allele frequency, suggesting reference homozygous genotype - HET: Regions with intermediate normalized alternative allele frequency, suggesting heterozygous genotype - ALT: Regions with high normalized alternative allele frequency, suggesting alternative homozygous genotype
This approach provides a biologically interpretable framework for analyzing genomic ancestry patterns in maize.
## R version 4.4.2 (2024-10-31)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sequoia 15.1.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tibble_3.2.1 ggpubr_0.6.0 scales_1.3.0
## [4] tidyr_1.3.1 rebmix_2.16.0 Ckmeans.1d.dp_4.3.5
## [7] ggplot2_3.5.2 dplyr_1.1.4
##
## loaded via a namespace (and not attached):
## [1] sass_0.4.10 generics_0.1.3 rstatix_0.7.2 lattice_0.22-6
## [5] digest_0.6.37 magrittr_2.0.3 evaluate_1.0.3 grid_4.4.2
## [9] fastmap_1.2.0 Matrix_1.7-1 jsonlite_2.0.0 backports_1.5.0
## [13] gridExtra_2.3 mgcv_1.9-1 purrr_1.0.4 jquerylib_0.1.4
## [17] abind_1.4-5 Rdpack_2.6.1 cli_3.6.4 rlang_1.1.6
## [21] rbibutils_2.2.16 cowplot_1.1.3 munsell_0.5.1 splines_4.4.2
## [25] withr_3.0.2 cachem_1.1.0 yaml_2.3.10 tools_4.4.2
## [29] ggsignif_0.6.4 colorspace_2.1-1 broom_1.0.8 vctrs_0.6.5
## [33] R6_2.6.1 lifecycle_1.0.4 car_3.1-2 pkgconfig_2.0.3
## [37] pillar_1.10.2 bslib_0.9.0 gtable_0.3.6 glue_1.8.0
## [41] Rcpp_1.0.14 xfun_0.52 tidyselect_1.2.1 rstudioapi_0.17.1
## [45] knitr_1.50 farver_2.1.2 htmltools_0.5.8.1 nlme_3.1-166
## [49] labeling_0.4.3 rmarkdown_2.29 carData_3.0-5 compiler_4.4.2