This document outlines a genomic ancestry analysis pipeline for maize introgression lines. The analysis identifies genomic ancestry across bins of size 1Mb using the following approach:
# Load required libraries
library(dplyr) # Data manipulation
library(ggplot2) # Plotting
library(Ckmeans.1d.dp) # K-means clustering
library(rebmix) # Gaussian mixture model
library(HMM) # Hidden Markov Models
library(tidyr) # Data reshaping
library(scales) # Unit formatting for plots
library(ggpubr) # Plot arrangement
library(tibble) # Modern data frames
# Read the binned allelic counts
read_freq <- read.table("~/Desktop/PN17_SID1632_bin_genotypes.tsv", header=TRUE)
# Display first few rows of the binned data
head(read_freq)
## CONTIG BIN_POS VARIANT_COUNT INFORMATIVE_VARIANT_COUNT DEPTH_SUM ALT_COUNT
## 1 chr1 1 8163 3810 7355 34
## 2 chr1 2 13158 5512 10174 20
## 3 chr1 3 14272 6128 11390 25
## 4 chr1 4 11263 5084 9273 16
## 5 chr1 5 12470 5440 10589 38
## 6 chr1 6 10971 4497 8514 21
## ALT_FREQ BIN_START BIN_END ALT_FREQ_NORM GENOTYPE
## 1 0.004622706 5892 999990 0.002914504 REF
## 2 0.001965795 1000069 1999892 0.001997776 REF
## 3 0.002194908 2000252 2999995 0.002419468 REF
## 4 0.001725439 3000032 3989703 0.001500972 REF
## 5 0.003588630 4002810 4999974 0.003456319 REF
## 6 0.002466526 5000001 5997031 0.002090021 REF
# Calculate summary statistics
IVC_mean <- mean(read_freq$INFORMATIVE_VARIANT_COUNT)
IVC_sd <- sd(read_freq$INFORMATIVE_VARIANT_COUNT)
# Display summary of bins
cat("Total number of bins:", nrow(read_freq), "\n")
## Total number of bins: 2132
## Average depth sum: 9686.165
## Average number of informative variants per bin: 5223.613
cat("Median number of informative variants per bin:", median(read_freq$INFORMATIVE_VARIANT_COUNT), "\n")
## Median number of informative variants per bin: 5093.5
cat("Range of informative variants per bin:", min(read_freq$INFORMATIVE_VARIANT_COUNT), "to", max(read_freq$INFORMATIVE_VARIANT_COUNT), "\n")
## Range of informative variants per bin: 13 to 10052
# Create weightesALT_FREQ
read_freq <- read_freq %>%
mutate(
ALT_FREQ_WEIGHTED = ALT_FREQ * (INFORMATIVE_VARIANT_COUNT / IVC_mean)
)
# Visualize the relationship between informative variant count and ALT_FREQ
ggplot(read_freq, aes(x = INFORMATIVE_VARIANT_COUNT, y = ALT_FREQ)) +
geom_point(alpha = 0.6) +
labs(title = paste("Relationship between informative variant count and ALT_FREQ (", BIN_SIZE_LABEL, " bins)", sep=""),
x = "Number of informative variants in bin",
y = "Alternative allele frequency") +
theme_minimal()
# Compare original vs normalized frequencies
p1 <- ggplot(read_freq, aes(x = ALT_FREQ)) +
geom_histogram(bins = 30, fill = "steelblue", color = "white") +
labs(title = "Original ALT_FREQ distribution", x = "ALT_FREQ") +
theme_minimal()
p2 <- ggplot(read_freq, aes(x = ALT_FREQ_WEIGHTED)) +
geom_histogram(bins = 30, fill = "tomato", color = "white") +
labs(title = "Weighted ALT_FREQ distribution", x = "ALT_FREQ_WEIGHTED") +
theme_minimal()
ggpubr::ggarrange(p1, p2, ncol = 2)
# Check how many bins have ALT_FREQ = 0
zero_alt_bins <- sum(read_freq$ALT_FREQ_WEIGHTED == 0)
total_bins <- nrow(read_freq)
cat("Bins with ALT_FREQ_WEIGHTED = 0:",
zero_alt_bins, "out of", total_bins,
"(", round(100 * zero_alt_bins / total_bins, 1), "%)\n")
## Bins with ALT_FREQ_WEIGHTED = 0: 5 out of 2132 ( 0.2 %)
# Create a separate dataset for non-zero bins (for clustering)
non_zero_bins <- read_freq %>%
filter(ALT_FREQ_WEIGHTED > 0)
cat("Bins with ALT_FREQ_WEIGHTED > 0 (to be clustered):",
nrow(non_zero_bins), "out of", total_bins,
"(", round(100 * nrow(non_zero_bins) / total_bins, 1), "%)\n")
## Bins with ALT_FREQ_WEIGHTED > 0 (to be clustered): 2127 out of 2132 ( 99.8 %)
# Create transformations for better clustering performance
non_zero_bins$log <- log10(non_zero_bins$ALT_FREQ_WEIGHTED)
non_zero_bins$asin <- asin(sqrt(non_zero_bins$ALT_FREQ_WEIGHTED))
# Visualize distributions of the normalized and transformed data
p1 <- ggplot(non_zero_bins, aes(x = ALT_FREQ_WEIGHTED)) +
geom_histogram(bins = 30, fill = "steelblue", color = "white") +
labs(title = paste("Weighted ALT_FREQ distribution (non-zero bins, ", BIN_SIZE_LABEL, ")", sep=""),
x = "ALT_FREQ_WEIGHTED") +
theme_minimal()
p2 <- ggplot(non_zero_bins, aes(x = asin)) +
geom_histogram(bins = 30, fill = "steelblue", color = "white") +
labs(title = "Arcsin transformation (non-zero bins)",
x = "asin(sqrt(ALT_FREQ_WEIGHTED))") +
theme_minimal()
p3 <- ggplot(non_zero_bins, aes(x = log)) +
geom_histogram(bins = 30, fill = "steelblue", color = "white") +
labs(title = "Log10 transformation (non-zero bins)",
x = "log10(ALT_FREQ_WEIGHTED)") +
theme_minimal()
ggpubr::ggarrange(p1, p2, p3, common.legend = TRUE)
K <- 3
# Apply K-means clustering with k=3 to each transformation for non-zero bins
non_zero_bins$K <- as.factor(Ckmeans.1d.dp(non_zero_bins$ALT_FREQ_WEIGHTED, K)$cluster)
non_zero_bins$Klog <- as.factor(Ckmeans.1d.dp(non_zero_bins$log, K)$cluster)
non_zero_bins$Kasin <- as.factor(Ckmeans.1d.dp(non_zero_bins$asin, K)$cluster)
# Function to relabel clusters as REF, HET, ALT based on ALT_FREQ
relabel_clusters <- function(clusters, data) {
# Calculate mean ALT_FREQ for each cluster
cluster_means <- tapply(data$ALT_FREQ, clusters, mean)
# Order clusters by mean ALT_FREQ
ordered_clusters <- order(cluster_means)
# Create mapping from original cluster to REF, HET, ALT
cluster_map <- rep(NA, length(unique(as.numeric(clusters))))
cluster_map[ordered_clusters[1]] <- "REF"
cluster_map[ordered_clusters[2]] <- "HET"
cluster_map[ordered_clusters[3]] <- "ALT"
# Apply mapping to original clusters
return(factor(cluster_map[as.numeric(clusters)], levels = c("REF", "HET", "ALT")))
}
# Apply relabeling to each clustering method
non_zero_bins$K <- relabel_clusters(non_zero_bins$K, non_zero_bins)
non_zero_bins$Klog <- relabel_clusters(non_zero_bins$Klog, non_zero_bins)
non_zero_bins$Kasin <- relabel_clusters(non_zero_bins$Kasin, non_zero_bins)
# Check cluster proportions for each method
cat("K-means cluster proportions (non-zero bins):\n")
## K-means cluster proportions (non-zero bins):
##
## REF HET ALT
## 0.969 0.018 0.012
##
## Log-transformed K-means cluster proportions (non-zero bins):
##
## REF HET ALT
## 0.347 0.620 0.033
##
## Arcsin-transformed K-means cluster proportions (non-zero bins):
##
## REF HET ALT
## 0.968 0.019 0.014
# Fit Gaussian mixture model with 3 components
normalest <- REBMIX(
Dataset = list(data.frame(Value = non_zero_bins$ALT_FREQ)),
Preprocessing = "histogram",
cmin = 3, # Force 3 components
cmax = 3,
Criterion = "BIC",
pdf = "normal"
)
# Assign clusters
normclu <- RCLRMIX(x = normalest)
non_zero_bins$Kgmm <- as.factor(normclu@Zp)
# Relabel Kgmm clusters
non_zero_bins$Kgmm <- relabel_clusters(non_zero_bins$Kgmm, non_zero_bins)
# Check Kgmm cluster proportions
cat("Gaussian Mixture Model cluster proportions (non-zero bins):\n")
## Gaussian Mixture Model cluster proportions (non-zero bins):
##
## REF HET ALT
## 0.956 0.017 0.026
# Create new columns in read_freq for each clustering method
read_freq$K <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Klog <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Kasin <- factor(NA, levels = c("REF", "HET", "ALT"))
read_freq$Kgmm <- factor(NA, levels = c("REF", "HET", "ALT"))
# Assign the non-zero bin clustering results to the main dataset
for (method in c("K", "Klog", "Kasin", "Kgmm")) {
# Create an index for non-zero bins in the original data
non_zero_idx <- which(read_freq$ALT_FREQ_WEIGHTED > 0)
# Assign cluster classifications to non-zero bins
read_freq[[method]][non_zero_idx] <- non_zero_bins[[method]]
}
# Force bins with ALT_FREQ = 0 to be "REF"
zero_idx <- which(read_freq$ALT_FREQ == 0)
for (method in c("K", "Klog", "Kasin", "Kgmm")) {
read_freq[[method]][zero_idx] <- "REF"
}
# Verify the merging worked correctly
cat("Final cluster proportions for raw K-means:\n")
## Final cluster proportions for raw K-means:
##
## REF HET ALT
## 0.970 0.018 0.012
For maize introgression lines produced through backcrossing followed by selfing (e.g., BC2S3), we can calculate expected genotype frequencies based on Mendelian genetics:
# Create mating matrices
create_mating_matrices <- function() {
# Backcross matrix for crossing population with AA donor
# Each COLUMN represents offspring distribution from that parent × AA
# Column 1: AA × AA → [1, 0, 0] (100% AA offspring)
# Column 2: Aa × AA → [1/2, 1/2, 0] (50% AA, 50% Aa offspring)
# Column 3: aa × AA → [0, 1, 0] (100% Aa offspring)
backcross_AA <- matrix(c(
1, 1/2, 0,
0, 1/2, 1,
0, 0, 0
), nrow = 3, byrow = TRUE)
# Backcross matrix for crossing population with aa donor
# Column 1: AA × aa → [0, 1, 0] (100% Aa offspring)
# Column 2: Aa × aa → [0, 1/2, 1/2] (50% Aa, 50% aa offspring)
# Column 3: aa × aa → [0, 0, 1] (100% aa offspring)
backcross_aa <- matrix(c(
0, 0, 0,
1, 1/2, 0,
0, 1/2, 1
), nrow = 3, byrow = TRUE)
# Selfing matrix - each individual self-fertilizes
# Column 1: AA selfed → [1, 0, 0] (100% AA offspring)
# Column 2: Aa selfed → [1/4, 1/2, 1/4] (25% AA, 50% Aa, 25% aa offspring)
# Column 3: aa selfed → [0, 0, 1] (100% aa offspring)
selfing <- matrix(c(
1, 1/4, 0,
0, 1/2, 0,
0, 1/4, 1
), nrow = 3, byrow = TRUE)
return(list(
backcross_AA = backcross_AA,
backcross_aa = backcross_aa,
selfing = selfing
))
}
# Function to calculate genotype frequencies after breeding scheme
calculate_nil_frequencies <- function(bc=2, s=3, donor_type = "aa", from = c(0,1,0) ) {
# Input validation
if (bc < 0 || s < 0) {
stop("Number of backcrosses and self generations must be non-negative")
}
if (!donor_type %in% c("AA", "aa")) {
stop("Donor type must be 'AA' or 'aa'")
}
# Get mating matrices
matrices <- create_mating_matrices()
# Initial F1 generation - all heterozygous after crossing pure lines
# Population vector: [AA frequency, Aa frequency, aa frequency]
current_population <- matrix(from, ncol = 1)
# Apply backcrosses sequentially
if (bc > 0) {
# Select appropriate backcross matrix based on donor type
backcross_matrix <- if (donor_type == "aa") {
matrices$backcross_AA
} else {
matrices$backcross_aa
}
# Each backcross: Matrix %*% population_vector
# This takes weighted average of matrix columns based on current population
for (i in 1:bc) {
current_population <- backcross_matrix %*% current_population
# Optional: print intermediate results for debugging
if (exists("debug_nil") && debug_nil) {
cat("After backcross", i, ": [",
round(current_population[1,1], 4), ", ",
round(current_population[2,1], 4), ", ",
round(current_population[3,1], 4), "]\n")
}
}
}
# Apply selfing generations sequentially
if (s > 0) {
# Each selfing: selfing_matrix %*% population_vector
# Each column represents offspring from selfing that genotype
for (i in 1:s) {
current_population <- matrices$selfing %*% current_population
# Optional: print intermediate results for debugging
if (exists("debug_nil") && debug_nil) {
cat("After self", i, ": [",
round(current_population[1,1], 4), ", ",
round(current_population[2,1], 4), ", ",
round(current_population[3,1], 4), "]\n")
}
}
}
# Extract final frequencies as named vector
final_frequencies <- as.vector(current_population)
names(final_frequencies) <- c("AA", "Aa", "aa")
return(final_frequencies)
}
# Function to return frequencies in HMM format (REF, HET, ALT)
nil_frequencies_for_hmm <- function(bc, s, donor_type = "aa", from = c(0,1,0)) {
# Calculate raw genotype frequencies
genotype_freqs <- calculate_nil_frequencies(bc, s, donor_type, from)
# Convert to HMM format based on which allele is being introgressed
if (donor_type == "AA") {
# AA is the introgressed (donor) allele
# aa represents the recurrent parent background (REF)
# AA represents regions with donor allele (ALT)
hmm_frequencies <- c(
REF = genotype_freqs["aa"], # Recurrent parent background
HET = genotype_freqs["Aa"], # Heterozygous regions
ALT = genotype_freqs["AA"] # Donor introgression
)
} else {
# aa is the introgressed (donor) allele
# AA represents the recurrent parent background (REF)
# aa represents regions with donor allele (ALT)
hmm_frequencies <- c(
REF = genotype_freqs["AA"], # Recurrent parent background
HET = genotype_freqs["Aa"], # Heterozygous regions
ALT = genotype_freqs["aa"] # Donor introgression
)
}
names(hmm_frequencies) <- c("REF", "HET","ALT")
return(hmm_frequencies)
}
bc2s3 <- nil_frequencies_for_hmm(bc=2,s=3)
cat("Expected genotype frequencies for BC2S3:\n")
## Expected genotype frequencies for BC2S3:
## REF: 0.8594
## HET: 0.0312
## ALT: 0.1094
# Function to apply HMM smoothing to ancestry calls
smooth_ancestry_with_hmm <- function(genotypes, transitions = c(0.96, 0.04)) {
# Convert genotypes to numeric (0=REF, 1=HET, 2=ALT)
geno_numeric <- as.numeric(factor(genotypes, levels=c("REF", "HET", "ALT"))) - 1
# Set up transition probabilities (high probability of staying in same state)
trans_prob <- matrix(c(
transitions[1], transitions[2]/2, transitions[2]/2, # From REF
transitions[2]/2, transitions[1], transitions[2]/2, # From HET
transitions[2]/2, transitions[2]/2, transitions[1] # From ALT
), nrow=3, byrow=TRUE)
# Set up emission probabilities (how likely each state produces observed genotypes)
emiss_prob <- matrix(c(
0.9, 0.08, 0.02, # REF state
0.1, 0.8, 0.1, # HET state
0.02, 0.08, 0.9 # ALT state
), nrow=3, byrow=TRUE)
# Initialize HMM with BC2S3 priors
hmm <- initHMM(c("REF", "HET", "ALT"), c("0", "1", "2"),
startProbs = bc2s3, # Prior probabilities from breeding design
transProbs = trans_prob,
emissionProbs = emiss_prob)
# Run Viterbi algorithm to find most likely state sequence
viterbi_path <- viterbi(hmm, as.character(geno_numeric))
# Convert back to genotype calls
smoothed_genotypes <- factor(viterbi_path, levels=c("REF", "HET", "ALT"))
return(smoothed_genotypes)
}
# Apply HMM smoothing to each chromosome separately
read_freq <- read_freq %>%
group_by(CONTIG) %>%
mutate(
K_HMM = smooth_ancestry_with_hmm(K, transitions=c(0.995, 0.005)),
Kasin_HMM = smooth_ancestry_with_hmm(Kasin, transitions=c(0.995, 0.005)),
Klog_HMM = smooth_ancestry_with_hmm(Klog, transitions=c(0.995, 0.005)),
Kgmm_HMM = smooth_ancestry_with_hmm(Kgmm, transitions=c(0.995, 0.005))
) %>%
ungroup()
# Ensure chromosome factor ordering
read_freq$CONTIG <- factor(read_freq$CONTIG, levels = paste0("chr", 1:10))
# Define color palette
pal <- c("REF" = "gold", "HET" = "springgreen4", "ALT" = "purple4")
# Create tile plot function
create_tile <- function(data, method_col, title) {
data %>%
mutate(
CONTIG = factor(CONTIG, levels = paste0("chr", 1:10)),
POS = BIN_POS * BIN_SIZE
) %>%
ggplot(aes(x = POS, y = 1, fill = .data[[method_col]])) +
geom_tile() +
facet_wrap( .~CONTIG, ncol = 1, strip.position = "left") +
scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
scale_fill_manual(values = pal) +
labs(title = title, x = "Position (bp)", fill = "Genotype") +
ggpubr::theme_classic2(base_size = 10) +
theme(
legend.position = "bottom",
strip.background = element_blank(),
strip.text.y.left = element_text(angle = 0, hjust = 0),
panel.spacing.y = unit(0.0, "lines"),
axis.line.y = element_blank(),
axis.text.y = element_blank(),
axis.title.y = element_blank(),
axis.ticks.y = element_blank()
)
}
# Compare before and after HMM smoothing
create_tile(read_freq, "Kgmm", paste("Before HMM Smoothing |", BIN_SIZE_LABEL, "bins"))
# Create a combined plot for all methods including HMM
read_freq %>%
pivot_longer(cols = starts_with("K"), names_to = "method", values_to = "genotype") %>%
mutate(
POS = BIN_POS * BIN_SIZE,
method = factor(method,
levels = c("K", "K_HMM", "Klog", "Klog_HMM", "Kasin", "Kasin_HMM", "Kgmm", "Kgmm_HMM"),
labels = c("K-means (weighted)", "K-means (weighted)\nHMM",
"K-means (log10)", "K-means (log10)\nHMM",
"K-means (arcsin)", "K-means (arcsin)\nHMM",
"Gaussian Mixture", "Gaussian Mixture]\nHMM"))
) %>%
ggplot(aes(x = POS, y = 1, fill = genotype)) +
geom_tile() +
scale_fill_manual(values = pal) +
facet_grid(method ~ CONTIG, scales = "free") +
scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
ggpubr::theme_classic2() +
theme(
legend.position = "bottom",
strip.background = element_blank(),
strip.text.x = element_text(hjust = 0),
strip.text.y = element_text(angle = 0, face = "bold"),
panel.spacing.x = unit(0.0, "lines"),
panel.spacing.y = unit(0.0, "lines"),
axis.line.y = element_blank(),
axis.text.y = element_blank(),
axis.line.x = element_blank(),
axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_blank(),
axis.ticks.y = element_blank()
) +
labs(
title = paste("Comparison of Clustering Methods with HMM Smoothing (", BIN_SIZE_LABEL, " bins)", sep=""),
x = "Genome Position (Mb)",
fill = "Genotype"
)
# Create a summary of genotype frequencies for all methods
genotype_freq <- read_freq %>%
pivot_longer(cols = c(K, Klog, Kasin, Kgmm, K_HMM, Klog_HMM, Kasin_HMM, Kgmm_HMM),
names_to = "method",
values_to = "genotype") %>%
group_by(method, genotype) %>%
summarize(
count = n(),
percentage = n() / nrow(read_freq) * 100,
.groups = "drop"
) %>%
mutate(
method_type = ifelse(grepl("_HMM", method), "HMM Smoothed", "Original"),
method_base = gsub("_HMM", "", method),
method = case_when(
method == "K" ~ "K-means (weighted)",
method == "K_HMM" ~ "K-means (weighted) HMM",
method == "Klog" ~ "K-means (log10)",
method == "Klog_HMM" ~ "K-means (log10) HMM",
method == "Kasin" ~ "K-means (arcsin)",
method == "Kasin_HMM" ~ "K-means (arcsin) HMM",
method == "Kgmm" ~ "Gaussian Mixture",
method == "Kgmm_HMM" ~ "Gaussian Mixture HMM",
TRUE ~ method
)
)
# Add expected frequencies
expectation <- data.frame(
method = "Expected (BC2S3)",
genotype = c("REF", "HET", "ALT"),
count = round(bc2s3 * nrow(read_freq), 0),
percentage = 100 * bc2s3,
method_type = "Theoretical",
method_base = "Expected"
)
# Combine and create frequency table
method_freq <- rbind(expectation, genotype_freq) %>%
select(method, genotype, percentage) %>%
pivot_wider(
id_cols = genotype,
names_from = method,
values_from = percentage
)
# Display frequency table
method_freq %>%
knitr::kable(
caption = paste("Genotype frequencies (%) across methods (", BIN_SIZE_LABEL, " bins)", sep=""),
digits = 1
)
genotype | Expected (BC2S3) | K-means (weighted) | K-means (weighted) HMM | K-means (arcsin) | K-means (arcsin) HMM | Gaussian Mixture | Gaussian Mixture HMM | K-means (log10) | K-means (log10) HMM |
---|---|---|---|---|---|---|---|---|---|
REF | 85.9 | 97.0 | 97.4 | 96.8 | 97.4 | 95.6 | 97.0 | 34.9 | 22.7 |
HET | 3.1 | 1.8 | 1.1 | 1.9 | 1.1 | 1.7 | 0.5 | 61.8 | 74.1 |
ALT | 10.9 | 1.2 | 1.5 | 1.4 | 1.5 | 2.6 | 2.5 | 3.3 | 3.2 |
# Visualize frequencies
rbind(expectation, genotype_freq) %>%
ggplot(aes(x = reorder(method, percentage), y = percentage, fill = genotype)) +
geom_bar(stat = "identity", position = "stack") +
scale_fill_manual(values = pal) +
labs(
title = paste("Genotype frequencies across methods (", BIN_SIZE_LABEL, " bins)", sep=""),
x = "Method",
y = "Percentage (%)",
fill = "Genotype"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "top"
) +
coord_flip()
# Calculate agreement between methods including HMM versions
methods <- c("K", "Klog", "Kasin", "Kgmm", "K_HMM", "Klog_HMM", "Kasin_HMM", "Kgmm_HMM")
agreement_matrix <- matrix(NA, length(methods), length(methods))
colnames(agreement_matrix) <- methods
rownames(agreement_matrix) <- methods
for (i in 1:length(methods)) {
for (j in 1:length(methods)) {
agreement_matrix[i, j] <- mean(read_freq[[methods[i]]] == read_freq[[methods[j]]]) * 100
}
}
# Display agreement matrix
knitr::kable(agreement_matrix, caption = "Percentage agreement between methods", digits = 1)
K | Klog | Kasin | Kgmm | K_HMM | Klog_HMM | Kasin_HMM | Kgmm_HMM | |
---|---|---|---|---|---|---|---|---|
K | 100.0 | 36.1 | 99.7 | 97.3 | 98.8 | 23.9 | 98.8 | 98.2 |
Klog | 36.1 | 100.0 | 36.3 | 38.2 | 36.3 | 74.1 | 36.3 | 37.2 |
Kasin | 99.7 | 36.3 | 100.0 | 97.6 | 98.8 | 24.1 | 98.8 | 98.4 |
Kgmm | 97.3 | 38.2 | 97.6 | 100.0 | 97.3 | 25.7 | 97.3 | 98.2 |
K_HMM | 98.8 | 36.3 | 98.8 | 97.3 | 100.0 | 24.2 | 100.0 | 98.9 |
Klog_HMM | 23.9 | 74.1 | 24.1 | 25.7 | 24.2 | 100.0 | 24.2 | 25.1 |
Kasin_HMM | 98.8 | 36.3 | 98.8 | 97.3 | 100.0 | 24.2 | 100.0 | 98.9 |
Kgmm_HMM | 98.2 | 37.2 | 98.4 | 98.2 | 98.9 | 25.1 | 98.9 | 100.0 |
# Visualize agreement matrix
agreement_df <- as.data.frame(agreement_matrix) %>%
tibble::rownames_to_column("Method1") %>%
pivot_longer(-Method1, names_to = "Method2", values_to = "Agreement")
ggplot(agreement_df, aes(x = Method1, y = Method2, fill = Agreement)) +
geom_tile() +
geom_text(aes(label = round(Agreement, 1)), color = "white", size = 3) +
scale_fill_gradient(low = "steelblue", high = "darkred") +
labs(
title = "Agreement Between Methods (%)",
x = NULL,
y = NULL,
fill = "Agreement (%)"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
axis.text = element_text(face = "bold"),
panel.grid = element_blank()
)
# Calculate how much each method changed after HMM smoothing
hmm_effects <- data.frame(
method = c("K", "Klog", "Kasin", "Kgmm"),
changes = c(
mean(read_freq$K != read_freq$K_HMM) * 100,
mean(read_freq$Klog != read_freq$Klog_HMM) * 100,
mean(read_freq$Kasin != read_freq$Kasin_HMM) * 100,
mean(read_freq$Kgmm != read_freq$Kgmm_HMM) * 100
)
)
# Calculate direction of changes (transitions between states)
transition_summary <- data.frame()
for (method in c("K", "Klog", "Kasin", "Kgmm")) {
hmm_method <- paste0(method, "_HMM")
transitions <- table(read_freq[[method]], read_freq[[hmm_method]])
# Calculate off-diagonal (changed) proportions
total_changes <- sum(transitions) - sum(diag(transitions))
for (i in rownames(transitions)) {
for (j in colnames(transitions)) {
if (i != j && transitions[i, j] > 0) {
transition_summary <- rbind(transition_summary, data.frame(
method = method,
from = i,
to = j,
count = transitions[i, j],
percentage = transitions[i, j] / nrow(read_freq) * 100
))
}
}
}
}
# Display HMM effects
cat("Percentage of bins changed by HMM smoothing:\n")
## Percentage of bins changed by HMM smoothing:
hmm_effects %>%
mutate(method = case_when(
method == "K" ~ "K-means (weighted)",
method == "Klog" ~ "K-means (log10)",
method == "Kasin" ~ "K-means (arcsin)",
method == "Kgmm" ~ "Gaussian Mixture",
TRUE ~ method
)) %>%
knitr::kable(col.names = c("Method", "% Bins Changed"), digits = 2)
Method | % Bins Changed |
---|---|
K-means (weighted) | 1.22 |
K-means (log10) | 25.94 |
K-means (arcsin) | 1.22 |
Gaussian Mixture | 1.78 |
##
## Detailed transition patterns:
transition_summary %>%
mutate(
transition = paste(from, "â", to),
method = case_when(
method == "K" ~ "K-means (weighted)",
method == "Klog" ~ "K-means (log10)",
method == "Kasin" ~ "K-means (arcsin)",
method == "Kgmm" ~ "Gaussian Mixture",
TRUE ~ method
)
) %>%
select(method, transition, count, percentage) %>%
knitr::kable(digits = 2)
method | transition | count | percentage |
---|---|---|---|
K-means (weighted) | REF â HET | 3 | 0.14 |
K-means (weighted) | REF â ALT | 2 | 0.09 |
K-means (weighted) | HET â REF | 13 | 0.61 |
K-means (weighted) | HET â ALT | 6 | 0.28 |
K-means (weighted) | ALT â REF | 2 | 0.09 |
K-means (log10) | REF â HET | 403 | 18.90 |
K-means (log10) | REF â ALT | 2 | 0.09 |
K-means (log10) | HET â REF | 143 | 6.71 |
K-means (log10) | HET â ALT | 1 | 0.05 |
K-means (log10) | ALT â REF | 1 | 0.05 |
K-means (log10) | ALT â HET | 3 | 0.14 |
K-means (arcsin) | REF â HET | 2 | 0.09 |
K-means (arcsin) | REF â ALT | 1 | 0.05 |
K-means (arcsin) | HET â REF | 15 | 0.70 |
K-means (arcsin) | HET â ALT | 5 | 0.23 |
K-means (arcsin) | ALT â REF | 2 | 0.09 |
K-means (arcsin) | ALT â HET | 1 | 0.05 |
Gaussian Mixture | REF â ALT | 2 | 0.09 |
Gaussian Mixture | HET â REF | 28 | 1.31 |
Gaussian Mixture | HET â ALT | 2 | 0.09 |
Gaussian Mixture | ALT â REF | 3 | 0.14 |
Gaussian Mixture | ALT â HET | 3 | 0.14 |
# Visualize HMM effects
ggplot(hmm_effects, aes(x = reorder(method, changes), y = changes)) +
geom_col(fill = "steelblue", alpha = 0.7) +
geom_text(aes(label = paste0(round(changes, 1), "%")),
hjust = -0.1, size = 4) +
labs(
title = "Effect of HMM Smoothing on Different Clustering Methods",
x = "Clustering Method",
y = "Percentage of Bins Changed (%)"
) +
theme_minimal() +
coord_flip()
# Analyze where methods disagree most
disagreement_data <- read_freq %>%
mutate(
POS = BIN_POS * BIN_SIZE,
# Check if all original methods agree
original_agreement = factor(ifelse(
K == Klog & K == Kasin & K == Kgmm,
"All methods agree",
"Methods disagree"
), levels = c("All methods agree", "Methods disagree")),
# Check if all HMM methods agree
hmm_agreement = factor(ifelse(
K_HMM == Klog_HMM & K_HMM == Kasin_HMM & K_HMM == Kgmm_HMM,
"All methods agree",
"Methods disagree"
), levels = c("All methods agree", "Methods disagree"))
)
# Calculate disagreement statistics
original_disagreement <- mean(disagreement_data$original_agreement == "Methods disagree") * 100
hmm_disagreement <- mean(disagreement_data$hmm_agreement == "Methods disagree") * 100
cat("Original methods disagreement:", round(original_disagreement, 1), "%\n")
## Original methods disagreement: 64.1 %
## HMM smoothed methods disagreement: 75.9 %
cat("Reduction in disagreement:", round(original_disagreement - hmm_disagreement, 1), "percentage points\n")
## Reduction in disagreement: -11.8 percentage points
# Create tile plot showing disagreement regions
p1 <- ggplot(disagreement_data, aes(x = POS, y = 1, fill = original_agreement)) +
geom_tile() +
scale_fill_manual(values = c("darkgreen", "darkred")) +
facet_grid(. ~ CONTIG, scales = "free_x") +
scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
labs(title = "Original Methods Agreement/Disagreement", fill = NULL) +
theme_minimal() +
theme(
legend.position = "bottom",
strip.background = element_blank(),
axis.line.y = element_blank(),
axis.text.y = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.spacing.x = unit(0.0, "lines")
)
p2 <- ggplot(disagreement_data, aes(x = POS, y = 1, fill = hmm_agreement)) +
geom_tile() +
scale_fill_manual(values = c("darkgreen", "darkred")) +
facet_grid(. ~ CONTIG, scales = "free_x") +
scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
labs(title = "HMM Smoothed Methods Agreement/Disagreement",
x = "Genome Position (Mb)", fill = NULL) +
theme_minimal() +
theme(
legend.position = "bottom",
strip.background = element_blank(),
axis.line.y = element_blank(),
axis.text.y = element_blank(),
axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
panel.spacing.x = unit(0.0, "lines")
)
ggpubr::ggarrange(p1, p2, ncol = 1, common.legend = TRUE)
Based on this comprehensive analysis of ancestry calling methods with HMM smoothing:
Method Performance: All clustering methods show reasonable performance, with the Gaussian Mixture Model and arcsin-transformed K-means showing slightly better separation of genotype classes.
HMM Smoothing Benefits:
Genotype Frequency Alignment: HMM smoothing brings observed frequencies closer to theoretical BC2S3 expectations, particularly for ALT regions.
Kgmm_HMM
) for final
ancestry calls, as it:
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-apple-darwin20
## Running under: macOS 15.3.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tibble_3.2.1 ggpubr_0.6.0 scales_1.4.0
## [4] tidyr_1.3.1 HMM_1.0.1 rebmix_2.16.0
## [7] Ckmeans.1d.dp_4.3.5 ggplot2_3.5.2 dplyr_1.1.4
##
## loaded via a namespace (and not attached):
## [1] sass_0.4.10 generics_0.1.4 rstatix_0.7.2 digest_0.6.37
## [5] magrittr_2.0.3 evaluate_1.0.3 grid_4.4.1 RColorBrewer_1.1-3
## [9] fastmap_1.2.0 jsonlite_2.0.0 backports_1.5.0 Formula_1.2-5
## [13] gridExtra_2.3 purrr_1.0.4 jquerylib_0.1.4 abind_1.4-8
## [17] Rdpack_2.6.4 cli_3.6.5 rlang_1.1.6 rbibutils_2.3
## [21] cowplot_1.1.3 withr_3.0.2 cachem_1.1.0 yaml_2.3.10
## [25] tools_4.4.1 ggsignif_0.6.4 broom_1.0.8 vctrs_0.6.5
## [29] R6_2.6.1 lifecycle_1.0.4 car_3.1-3 pkgconfig_2.0.3
## [33] pillar_1.10.2 bslib_0.9.0 gtable_0.3.6 glue_1.8.0
## [37] Rcpp_1.0.14 xfun_0.52 tidyselect_1.2.1 rstudioapi_0.17.1
## [41] knitr_1.50 dichromat_2.0-0.1 farver_2.1.2 htmltools_0.5.8.1
## [45] labeling_0.4.3 rmarkdown_2.29 carData_3.0-5 compiler_4.4.1