First, we load and process the three different datasets. Each dataset
requires a slightly different approach to calculate the key metrics:
mean coverage (lambda
) and observed missingness
(missing_obs
).
This dataset is derived from WGSmetrics_summary.tsv
.
# Load the first dataset
# Make sure to replace this path with the actual path to your file
base_dir <- "/Volumes/BZea/bzeaseq"
wgs_file <- file.path(base_dir, "WGSmetrics_summary.tsv")
wgs_df <- read_tsv(wgs_file, comment = "#")
## Rows: 1437 Columns: 33
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): SAMPLE, FOLD_80_BASE_PENALTY, FOLD_90_BASE_PENALTY, FOLD_95_BASE_P...
## dbl (29): GENOME_TERRITORY, MEAN_COVERAGE, SD_COVERAGE, MEDIAN_COVERAGE, MAD...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Compute per-sample coverage and missingness statistics
per_sample_wgs <- wgs_df %>%
transmute( # transmute keeps only the new columns
SAMPLE = SAMPLE,
lambda = MEAN_COVERAGE,
missing_obs = 1 - PCT_1X
) %>%
filter(is.finite(lambda) & is.finite(missing_obs))
head(per_sample_wgs)
## # A tibble: 6 × 3
## SAMPLE lambda missing_obs
## <chr> <dbl> <dbl>
## 1 PN10_SID865 0.262 0.806
## 2 PN10_SID866 0.907 0.558
## 3 PN10_SID867 0.534 0.680
## 4 PN10_SID868 0.608 0.651
## 5 PN10_SID869 0.545 0.683
## 6 PN10_SID870 0.637 0.640
This dataset is derived from
all_samples_bin_genotypes.tsv
.
# Load the second dataset
# Make sure to replace this path with the actual path to your file
base_dir <- "/Volumes/BZea/bzeaseq/ancestry"
wideseq_file <- file.path(base_dir, "all_samples_bin_genotypes.tsv")
wideseq_df <- read_tsv(wideseq_file, comment = "#")
## Rows: 3057288 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): SAMPLE, CONTIG, GENOTYPE, Kraw, Kasin, Kgmm, Kraw_HMM, Kasin_HMM, K...
## dbl (8): BIN_POS, VARIANT_COUNT, INFORMATIVE_VARIANT_COUNT, DEPTH_SUM, ALT_C...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Compute per-sample coverage and missingness by summarizing bins
per_sample_bzea <- wideseq_df %>%
group_by(SAMPLE) %>%
summarize(
VARIANT_COUNT = sum(VARIANT_COUNT, na.rm = TRUE),
INFORMATIVE_VARIANT_COUNT = sum(INFORMATIVE_VARIANT_COUNT, na.rm = TRUE),
DEPTH_SUM = sum(DEPTH_SUM, na.rm = TRUE),
.groups = 'drop'
) %>%
mutate(
lambda = DEPTH_SUM / VARIANT_COUNT,
missing_obs = 1 - INFORMATIVE_VARIANT_COUNT / VARIANT_COUNT
) %>%
select(SAMPLE, lambda, missing_obs) %>%
filter(is.finite(lambda) & is.finite(missing_obs))
head(per_sample_bzea)
## # A tibble: 6 × 3
## SAMPLE lambda missing_obs
## <chr> <dbl> <dbl>
## 1 PN10_SID865 0.415 0.739
## 2 PN10_SID866 1.40 0.446
## 3 PN10_SID867 0.867 0.573
## 4 PN10_SID868 0.945 0.550
## 5 PN10_SID869 0.888 0.575
## 6 PN10_SID870 0.998 0.534
This dataset is derived from allelic_counts50K.tsv
.
# Load the third dataset
# Make sure to replace this path with the actual path to your file
base_dir <- "/Volumes/BZea/bzeaseq/50K"
snp50K_file <- file.path(base_dir, "allelic_counts50K.tsv")
snp50K_df <- read_tsv(snp50K_file, comment = "@")
## Rows: 74815049 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): SAMPLE, CONTIG, REF_NUCLEOTIDE, ALT_NUCLEOTIDE
## dbl (3): POSITION, REF_COUNT, ALT_COUNT
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Compute per-sample coverage and missingness by summarizing SNPs
per_sample_snp50K <- snp50K_df %>%
group_by(SAMPLE) %>%
summarize(
VARIANT_COUNT = n(),
INFORMATIVE_VARIANT_COUNT = sum((REF_COUNT + ALT_COUNT) > 0, na.rm = TRUE),
DEPTH_SUM = sum(REF_COUNT + ALT_COUNT, na.rm = TRUE),
.groups = 'drop'
) %>%
mutate(
lambda = DEPTH_SUM / VARIANT_COUNT,
missing_obs = 1 - INFORMATIVE_VARIANT_COUNT / VARIANT_COUNT
) %>%
select(SAMPLE, lambda, missing_obs) %>%
filter(is.finite(lambda) & is.finite(missing_obs))
head(per_sample_snp50K)
## # A tibble: 6 × 3
## SAMPLE lambda missing_obs
## <chr> <dbl> <dbl>
## 1 PN10_SID865 0.313 0.769
## 2 PN10_SID866 0.961 0.466
## 3 PN10_SID867 0.666 0.586
## 4 PN10_SID868 0.676 0.579
## 5 PN10_SID869 0.697 0.579
## 6 PN10_SID870 0.724 0.558
Now, we apply the fit_exp_floor_model
function to all
three processed datasets. This function will fit the model and return
the results, including the key parameters \(\pi\) and \(k\).
# Fit the model for the WGS dataset
wgs_model_results <- fit_exp_floor_model(per_sample_wgs)
# Fit the model for the BZea binned dataset
bzea_model_results <- fit_exp_floor_model(per_sample_bzea)
# Fit the model for the SNP50K dataset
snp50K_model_results <- fit_exp_floor_model(per_sample_snp50K)
# Print the fitted parameters for comparison
cat("WGS Metrics Model Parameters:\n")
## WGS Metrics Model Parameters:
cat("Floor (π):", round(wgs_model_results$params$pi, 3), "\n")
## Floor (π): 0.446
cat("Decay (k):", round(wgs_model_results$params$k, 3), "\n\n")
## Decay (k): 1.661
cat("BZea Binned Model Parameters:\n")
## BZea Binned Model Parameters:
cat("Floor (π):", round(bzea_model_results$params$pi, 3), "\n")
## Floor (π): 0.346
cat("Decay (k):", round(bzea_model_results$params$k, 3), "\n\n")
## Decay (k): 1.256
cat("SNP50K Model Parameters:\n")
## SNP50K Model Parameters:
cat("Floor (π):", round(snp50K_model_results$params$pi, 3), "\n")
## Floor (π): 0.161
cat("Decay (k):", round(snp50K_model_results$params$k, 3), "\n")
## Decay (k): 1.042
Finally, we generate the validation plots for all three models and
display them side-by-side using cowplot::plot_grid
to
facilitate a direct visual comparison.
# Generate the coverage histograms for all datasets
hist_wgs <- generate_coverage_histogram(wgs_model_results$data)
hist_bzea <- generate_coverage_histogram(bzea_model_results$data)
hist_snp50K <- generate_coverage_histogram(snp50K_model_results$data)
# Generate the validation plot for the WGS dataset
plot_wgs <- generate_floor_model_plot(
model_results = wgs_model_results,
coverage_hist_plot = hist_wgs,
plot_title = "Whole Genome (0.8X goal)"
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Generate the validation plot for the BZea dataset
plot_bzea <- generate_floor_model_plot(
model_results = bzea_model_results,
coverage_hist_plot = hist_bzea,
plot_title = "Wideseq (27M Teosinte SNPs)"
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Generate the validation plot for the SNP50K dataset
plot_snp50K <- generate_floor_model_plot(
model_results = snp50K_model_results,
coverage_hist_plot = hist_snp50K,
plot_title = "Filtered SNPs (50K)"
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Arrange all three plots side-by-side
combined_plot <- plot_grid(plot_wgs, plot_bzea, plot_snp50K, ncol = 3)
# Display the combined plot
print(combined_plot)
Let’s create a summary table comparing the key statistics across all three datasets:
# Create a summary table comparing all three datasets
summary_stats <- data.frame(
Dataset = c("WGS", "Wideseq", "SNP50K"),
Samples = c(nrow(per_sample_wgs), nrow(per_sample_bzea), nrow(per_sample_snp50K)),
Mean_Coverage = c(
round(mean(per_sample_wgs$lambda), 2),
round(mean(per_sample_bzea$lambda), 2),
round(mean(per_sample_snp50K$lambda), 2)
),
Mean_Missingness = c(
round(mean(per_sample_wgs$missing_obs), 3),
round(mean(per_sample_bzea$missing_obs), 3),
round(mean(per_sample_snp50K$missing_obs), 3)
),
Floor_Pi = c(
round(wgs_model_results$params$pi, 3),
round(bzea_model_results$params$pi, 3),
round(snp50K_model_results$params$pi, 3)
),
Decay_k = c(
round(wgs_model_results$params$k, 3),
round(bzea_model_results$params$k, 3),
round(snp50K_model_results$params$k, 3)
)
)
# Display the summary table
knitr::kable(summary_stats,
caption = "Comparison of Model Parameters Across Datasets",
col.names = c("Dataset", "N Samples", "Mean Coverage", "Mean Missingness",
"Floor (π)", "Decay (k)"))
Dataset | N Samples | Mean Coverage | Mean Missingness | Floor (π) | Decay (k) |
---|---|---|---|---|---|
WGS | 1437 | 0.39 | 0.744 | 0.446 | 1.661 |
Wideseq | 1434 | 0.59 | 0.670 | 0.346 | 1.256 |
SNP50K | 1439 | 0.43 | 0.703 | 0.161 | 1.042 |