Comparing Estimations of missing data from Bzea

1 1. Data Loading and Processing

First, we load and process the three different datasets. Each dataset requires a slightly different approach to calculate the key metrics: mean coverage (lambda) and observed missingness (missing_obs).

1.1 Dataset 1: WGS Metrics Summary

This dataset is derived from WGSmetrics_summary.tsv.

# Load the first dataset
# Make sure to replace this path with the actual path to your file
base_dir <- "/Volumes/BZea/bzeaseq"
wgs_file <- file.path(base_dir, "WGSmetrics_summary.tsv")
wgs_df <- read_tsv(wgs_file, comment = "#")

## Rows: 1437 Columns: 33
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (4): SAMPLE, FOLD_80_BASE_PENALTY, FOLD_90_BASE_PENALTY, FOLD_95_BASE_P...
## dbl (29): GENOME_TERRITORY, MEAN_COVERAGE, SD_COVERAGE, MEDIAN_COVERAGE, MAD...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Compute per-sample coverage and missingness statistics
per_sample_wgs <- wgs_df %>%
  transmute( # transmute keeps only the new columns
    SAMPLE = SAMPLE,
    lambda = MEAN_COVERAGE,
    missing_obs = 1 - PCT_1X
  ) %>%
  filter(is.finite(lambda) & is.finite(missing_obs))

head(per_sample_wgs)

## # A tibble: 6 × 3
##   SAMPLE      lambda missing_obs
##   <chr>        <dbl>       <dbl>
## 1 PN10_SID865  0.262       0.806
## 2 PN10_SID866  0.907       0.558
## 3 PN10_SID867  0.534       0.680
## 4 PN10_SID868  0.608       0.651
## 5 PN10_SID869  0.545       0.683
## 6 PN10_SID870  0.637       0.640

1.2 Dataset 2: BZea Wideseq (27M Binned Genotypes)

This dataset is derived from all_samples_bin_genotypes.tsv.

# Load the second dataset
# Make sure to replace this path with the actual path to your file
base_dir <- "/Volumes/BZea/bzeaseq/ancestry"
wideseq_file <- file.path(base_dir, "all_samples_bin_genotypes.tsv")
wideseq_df <- read_tsv(wideseq_file, comment = "#")

## Rows: 3057288 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): SAMPLE, CONTIG, GENOTYPE, Kraw, Kasin, Kgmm, Kraw_HMM, Kasin_HMM, K...
## dbl (8): BIN_POS, VARIANT_COUNT, INFORMATIVE_VARIANT_COUNT, DEPTH_SUM, ALT_C...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Compute per-sample coverage and missingness by summarizing bins
per_sample_bzea <- wideseq_df %>%
  group_by(SAMPLE) %>%
  summarize(
    VARIANT_COUNT = sum(VARIANT_COUNT, na.rm = TRUE),
    INFORMATIVE_VARIANT_COUNT = sum(INFORMATIVE_VARIANT_COUNT, na.rm = TRUE),
    DEPTH_SUM = sum(DEPTH_SUM, na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  mutate(
    lambda = DEPTH_SUM / VARIANT_COUNT,
    missing_obs = 1 - INFORMATIVE_VARIANT_COUNT / VARIANT_COUNT
  ) %>%
  select(SAMPLE, lambda, missing_obs) %>%
  filter(is.finite(lambda) & is.finite(missing_obs))

head(per_sample_bzea)

## # A tibble: 6 × 3
##   SAMPLE      lambda missing_obs
##   <chr>        <dbl>       <dbl>
## 1 PN10_SID865  0.415       0.739
## 2 PN10_SID866  1.40        0.446
## 3 PN10_SID867  0.867       0.573
## 4 PN10_SID868  0.945       0.550
## 5 PN10_SID869  0.888       0.575
## 6 PN10_SID870  0.998       0.534

1.3 Dataset 3: 50K Filtered SNPs

This dataset is derived from allelic_counts50K.tsv.

# Load the third dataset
# Make sure to replace this path with the actual path to your file
base_dir <- "/Volumes/BZea/bzeaseq/50K"
snp50K_file <- file.path(base_dir, "allelic_counts50K.tsv")
snp50K_df <- read_tsv(snp50K_file, comment = "@")

## Rows: 74815049 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): SAMPLE, CONTIG, REF_NUCLEOTIDE, ALT_NUCLEOTIDE
## dbl (3): POSITION, REF_COUNT, ALT_COUNT
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Compute per-sample coverage and missingness by summarizing SNPs
per_sample_snp50K <- snp50K_df %>%
  group_by(SAMPLE) %>%
  summarize(
    VARIANT_COUNT = n(),
    INFORMATIVE_VARIANT_COUNT = sum((REF_COUNT + ALT_COUNT) > 0, na.rm = TRUE),
    DEPTH_SUM = sum(REF_COUNT + ALT_COUNT, na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  mutate(
    lambda = DEPTH_SUM / VARIANT_COUNT,
    missing_obs = 1 - INFORMATIVE_VARIANT_COUNT / VARIANT_COUNT
  ) %>%
  select(SAMPLE, lambda, missing_obs) %>%
  filter(is.finite(lambda) & is.finite(missing_obs))

head(per_sample_snp50K)

## # A tibble: 6 × 3
##   SAMPLE      lambda missing_obs
##   <chr>        <dbl>       <dbl>
## 1 PN10_SID865  0.313       0.769
## 2 PN10_SID866  0.961       0.466
## 3 PN10_SID867  0.666       0.586
## 4 PN10_SID868  0.676       0.579
## 5 PN10_SID869  0.697       0.579
## 6 PN10_SID870  0.724       0.558

2 2. Model Fitting

Now, we apply the fit_exp_floor_model function to all three processed datasets. This function will fit the model and return the results, including the key parameters \(\pi\) and \(k\).

# Fit the model for the WGS dataset
wgs_model_results <- fit_exp_floor_model(per_sample_wgs)

# Fit the model for the BZea binned dataset
bzea_model_results <- fit_exp_floor_model(per_sample_bzea)

# Fit the model for the SNP50K dataset
snp50K_model_results <- fit_exp_floor_model(per_sample_snp50K)

# Print the fitted parameters for comparison
cat("WGS Metrics Model Parameters:\n")

## WGS Metrics Model Parameters:

cat("Floor (π):", round(wgs_model_results$params$pi, 3), "\n")

## Floor (π): 0.446

cat("Decay (k):", round(wgs_model_results$params$k, 3), "\n\n")

## Decay (k): 1.661

cat("BZea Binned Model Parameters:\n")

## BZea Binned Model Parameters:

cat("Floor (π):", round(bzea_model_results$params$pi, 3), "\n")

## Floor (π): 0.346

cat("Decay (k):", round(bzea_model_results$params$k, 3), "\n\n")

## Decay (k): 1.256

cat("SNP50K Model Parameters:\n")

## SNP50K Model Parameters:

cat("Floor (π):", round(snp50K_model_results$params$pi, 3), "\n")

## Floor (π): 0.161

cat("Decay (k):", round(snp50K_model_results$params$k, 3), "\n")

## Decay (k): 1.042

3 3. Visual Comparison of Model Fits

Finally, we generate the validation plots for all three models and display them side-by-side using cowplot::plot_grid to facilitate a direct visual comparison.

# Generate the coverage histograms for all datasets
hist_wgs <- generate_coverage_histogram(wgs_model_results$data)
hist_bzea <- generate_coverage_histogram(bzea_model_results$data)
hist_snp50K <- generate_coverage_histogram(snp50K_model_results$data)

# Generate the validation plot for the WGS dataset
plot_wgs <- generate_floor_model_plot(
  model_results = wgs_model_results,
  coverage_hist_plot = hist_wgs,
  plot_title = "Whole Genome (0.8X goal)"
)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

# Generate the validation plot for the BZea dataset
plot_bzea <- generate_floor_model_plot(
  model_results = bzea_model_results,
  coverage_hist_plot = hist_bzea,
  plot_title = "Wideseq (27M Teosinte SNPs)"
)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

# Generate the validation plot for the SNP50K dataset
plot_snp50K <- generate_floor_model_plot(
  model_results = snp50K_model_results,
  coverage_hist_plot = hist_snp50K,
  plot_title = "Filtered SNPs (50K)"
)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

# Arrange all three plots side-by-side
combined_plot <- plot_grid(plot_wgs, plot_bzea, plot_snp50K, ncol = 3)

# Display the combined plot
print(combined_plot)

4 4. Summary Statistics Comparison

Let’s create a summary table comparing the key statistics across all three datasets:

# Create a summary table comparing all three datasets
summary_stats <- data.frame(
  Dataset = c("WGS", "Wideseq", "SNP50K"),
  Samples = c(nrow(per_sample_wgs), nrow(per_sample_bzea), nrow(per_sample_snp50K)),
  Mean_Coverage = c(
    round(mean(per_sample_wgs$lambda), 2),
    round(mean(per_sample_bzea$lambda), 2),
    round(mean(per_sample_snp50K$lambda), 2)
  ),
  Mean_Missingness = c(
    round(mean(per_sample_wgs$missing_obs), 3),
    round(mean(per_sample_bzea$missing_obs), 3),
    round(mean(per_sample_snp50K$missing_obs), 3)
  ),
  Floor_Pi = c(
    round(wgs_model_results$params$pi, 3),
    round(bzea_model_results$params$pi, 3),
    round(snp50K_model_results$params$pi, 3)
  ),
  Decay_k = c(
    round(wgs_model_results$params$k, 3),
    round(bzea_model_results$params$k, 3),
    round(snp50K_model_results$params$k, 3)
  )
)

# Display the summary table
knitr::kable(summary_stats, 
             caption = "Comparison of Model Parameters Across Datasets",
             col.names = c("Dataset", "N Samples", "Mean Coverage", "Mean Missingness", 
                          "Floor (π)", "Decay (k)"))

Comparison of Model Parameters Across Datasets
Dataset	N Samples	Mean Coverage	Mean Missingness	Floor (π)	Decay (k)
WGS	1437	0.39	0.744	0.446	1.661
Wideseq	1434	0.59	0.670	0.346	1.256
SNP50K	1439	0.43	0.703	0.161	1.042