## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.14.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.5.1     ✔ purrr   1.0.1
## ✔ tibble  3.2.1     ✔ dplyr   1.1.4
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.4     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

# Load the metadata CSV file into a data frame 'dat', and keep only one F1 female per family. 
dat <- read_csv("/Users/nuriteliash/Documents/GitHub/varroa-pedigree-study/data/meta_data_223.csv") %>% 
   rename(id = sample) %>% # Rename 'sample' column to 'id'
 filter(!str_detect(name, "sis")) %>% # remove sisters and aunties, F1s without offspring
  filter(!str_detect(name, "479-2")) %>%   # in family 478, i kept 479-1
    filter(!str_detect(name, "565-2")) %>%   # in Family 564, I kept 565-1
    filter(!str_detect(name, "535_1")) %>%   # in Family 534, I kept 535-2
    filter(!str_detect(name, "535_3"))  # in Family 534, I kept 535-2

## Rows: 223 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): name, sample, dev_stage, generation, sex, pedigree
## dbl (1): family
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Read the VCF file into an object 'vcf' using the vcfR library; 'verbose = FALSE' suppresses extra output
vcf <- read.vcfR("/Users/nuriteliash/Documents/GitHub/varroa-pedigree-study/data/vcf_filter/Q40BIALLDP16HDP40mis.5Chr7/Q40BIALLDP16HDP40mis.5Chr7.recode.vcf", verbose = FALSE)

# Display a summary of the VCF object (basic details about the VCF data)
vcf

## ***** Object of Class vcfR *****
## 223 samples
## 7 CHROMs
## 35,169 variants
## Object size: 293.5 Mb
## 0 percent missing data
## *****        *****         *****

# Rename the column names of the genotype matrix ('vcf@gt'):
colnames(vcf@gt) <- sub("_S\\d+$", "", colnames(vcf@gt))
  
# Extract the genotype (GT) and depth-related fields from the FORMAT column
gt <- extract.gt(vcf, element = "GT", as.numeric = FALSE)  # Extract Genotype (GT)
dp <- extract.gt(vcf, element = "DP", as.numeric = TRUE)  # Extract Depth (DP)
ao <- extract.gt(vcf, element = "AO", as.numeric = TRUE)  # Extract Alternate Observations (AO)
ad <- extract.gt(vcf, element = "AD") # Extract the allele depth (AD)

Variant Allele Frequency (VAF)

# Ensure AO and DP are numeric and avoid division by zero
ao[is.na(ao)] <- 0
dp[is.na(dp)] <- 1  # Avoid division by zero

# Calculate VAF for each site and sample
vaf_df <- ao / dp

# Convert to a long format for easier visualization
vaf_long <- as.data.frame(vaf_df) %>%
  rownames_to_column(var = "site") %>%
  pivot_longer(-site, names_to = "id", values_to = "VAF")

Genotyping quality (QAF)

qa <- extract.gt(vcf, element = "QA", as.numeric = TRUE)  # Quality of Alt Allele
qo <- extract.gt(vcf, element = "QR", as.numeric = TRUE)  # Quality of Ref Allele

# Ensure missing values are handled
qa[is.na(qa)] <- 0
qo[is.na(qo)] <- 0

# Compute QAF for each site and sample
qaf <- qa / (qa + qo)

# Convert to long format for better visualization
qaf_long <- as.data.frame(qaf) %>%
  rownames_to_column(var = "site") %>%
  pivot_longer(-site, names_to = "id", values_to = "QAF")

Heterozygosity

is_het <- as.data.frame(is_het(gt)) %>% 
  rownames_to_column(var = "site") %>% 
  pivot_longer(-site, names_to = "id", values_to = "het")

Allele depth (AD)

ad2 <- as.data.frame(ad) %>% 
  rownames_to_column(var = "site") %>% 
  pivot_longer(-site, names_to = "id", values_to = "ad") %>% 
  separate("ad", into = c("ref","alt"), sep = ",") 

ad_join <- select(dat, generation, sex, family, id = name) %>% 
  left_join(ad2) %>%  
  mutate(chrom = str_split(site, pattern = "_", simplify = TRUE)[, 2])

## Joining with `by = join_by(id)`

ad_join <- left_join(is_het, ad_join)

## Joining with `by = join_by(site, id)`

ad_join <- ad_join %>% mutate(ref = as.numeric(ref), alt = as.numeric(alt)) 
ad_join <- mutate(ad_join, chrom = factor(chrom, levels = unique(chrom), labels = seq_len(7)))

Genotype (GT)

gt_long <- gt %>% 
as.data.frame() %>% 
  rownames_to_column("site") %>% 
  pivot_longer(cols = -site, names_to = "id", values_to = "GT")

Depth (DP)

dp_long <- dp %>% 
as.data.frame() %>% 
  rownames_to_column("site") %>% 
  pivot_longer(cols = -site, names_to = "id", values_to = "DP")

Join all site parameters

# Combine information on allele depth, heterozygosity, VAF, GQ and family structure 
all_param <- ad_join %>%
  left_join(qaf_long, by = c("site", "id")) %>%
  left_join(vaf_long, by = c("site", "id")) %>%
  left_join(gt_long, by = c("site", "id")) %>%
  left_join(dp_long, by = c("site", "id")) %>%
na.omit() %>%
  rename(Sample = id) %>% 
mutate(member = case_when(
    generation == "F0" & sex == "female" ~ "grandmother",
    generation == "F1" & sex == "male" ~ "father",
    generation == "F1" & sex == "female" ~ "mother",
    generation == "F2" & sex == "male" ~ "son",
    generation == "F2" & sex == "female" ~ "daughter",
    TRUE ~ "unknown"
  )) %>%
  separate(site, sep = ".1_", into = c("Chrom", "pos"), remove = FALSE) %>%  # Keep "site"
mutate(pos = as.numeric(pos))

all_param <- all_param %>% filter(!(member %in% "unknown")) %>% # remove nymphs, keep only adult mites with determined sex 
 mutate(
    GT = factor(GT, levels = c("0/0", "0/1", "1/1"), labels = c("AA", "AB", "BB")),  # Ensure "BB" exists in factor levels
    member = factor(member, levels = c("grandmother", "father", "mother", "son", "daughter"), ordered = TRUE)  # Explicit order
  ) %>%
  mutate(member_num = as.numeric(member))  # Convert ordered factor to numeric

Sanity check, to determine the threshold for site filtering

In this section, we will filter the sites based on the VAF and QAF distributions, under the assumption, that when F0 and F1 are all homozygous, then their F2 female offspring, must be homozygous as well.
As a result, we can also assume, that any genotype in the F2 female that is NOT AA, is an error.
It is therefore should be safe to set the threshold for site quality (VAF and QAF values), based on these filtered sites.

F0 and F1 Homo AA, looking at F2 daughter (should all be AA)

Find a threshold for VAF and QAF

# Find positions where all F0 and all F1 individuals are homozygous AA
AA_sites_F0F1 <- all_param %>%
  filter(generation %in% c("F0", "F1")) %>%  # Keep only F0 and F1 individuals
  group_by(family, site) %>%  # Group by family and position
  summarise(all_homo = all(GT == "AA"), .groups = "drop") %>%  # Ensure all individuals in family are AA
  filter(all_homo) %>%  # Keep only positions where all individuals in F0 and F1 are AA
  select(family, site)  # Extract family and positions

# Count the number of positions per family that passed the homo filtering
AA_sites_F0F1 %>%
  group_by(family) %>%
  summarise(num_sites_passed = n())

## # A tibble: 30 × 2
##    family num_sites_passed
##     <dbl>            <int>
##  1     43            14919
##  2     46            18552
##  3     48            16832
##  4     57            15997
##  5     63             7653
##  6    110             9645
##  7    133            11801
##  8    177            16354
##  9    187            14527
## 10    240            18495
## # ℹ 20 more rows

# Filter the F2 females at the selected positions, on chrom 1
f2_homo_parents <- all_param %>%
  filter(generation == "F2") %>%
  filter(chrom == "1") %>%
  filter(sex == "female") %>%
  # Join with AA_sites_F0F1 to keep only sites that passed the filtering
  inner_join(AA_sites_F0F1, by = c("family", "site"))

# Visualize VAF and QAF distributions to determine thresholds
# QAF: All genotypes overlaid
QAF_all_gt <- ggplot(f2_homo_parents, aes(x = QAF, fill = GT)) +
  geom_histogram(binwidth = 0.02, alpha = 1, position = "identity") +
  scale_x_continuous(breaks = seq(0, 1, by = 0.1)) +
  theme_classic() +
  labs(title = "A",
       x = "Quality Allele Frequency (QAF)",
       y = "Count") +
  scale_fill_manual(
    values = c("AA" = "#ffbf00", "AB" = "#66b032", "BB" = "#1982c4"),
    name = "Genotype"
  ) +
  theme(legend.position = "none")

# QAF: Facetted by genotype
QAF_facet_gt <- ggplot(f2_homo_parents, aes(x = QAF, fill = GT)) +
  geom_histogram(binwidth = 0.02, alpha = 1, position = "identity") +
  facet_wrap(~ GT, scales = "free_y") +
  scale_x_continuous(breaks = seq(0, 1, by = 0.1)) +
  theme_classic() +
  labs(title = "B",
       x = "Quality Allele Frequency (QAF)",
       y = "Count") +
  scale_fill_manual(
    values = c("AA" = "#ffbf00", "AB" = "#66b032", "BB" = "#1982c4"),
    name = "Genotype"
  ) +
  theme(legend.position = "right")

# Combine QAF plots
QAF_combined <- (QAF_all_gt + QAF_facet_gt + plot_layout(widths = c(1, 3)))

# Display the combined QAF plot
QAF_combined

# Add caption below the plot
cat("**Figure 1.** Distribution of Quality Allele Frequency (QAF) values in F2 females from crosses with homozygous F0 and F1 parents.\nA. Overlaid histogram of QAF values for all genotypes.\nB. Same data faceted by genotype (AA, AB, BB), showing the distribution and spread within each genotype class.\nVertical axis represents count of sites; horizontal axis is QAF. Bin width = 0.02.\nGenotypes are color-coded: AA (yellow), AB (green), BB (blue).")

## **Figure 1.** Distribution of Quality Allele Frequency (QAF) values in F2 females from crosses with homozygous F0 and F1 parents.
## A. Overlaid histogram of QAF values for all genotypes.
## B. Same data faceted by genotype (AA, AB, BB), showing the distribution and spread within each genotype class.
## Vertical axis represents count of sites; horizontal axis is QAF. Bin width = 0.02.
## Genotypes are color-coded: AA (yellow), AB (green), BB (blue).

# VAF: All genotypes overlaid
VAF_all_gt <- ggplot(f2_homo_parents, aes(x = VAF, fill = GT)) +
  geom_histogram(binwidth = 0.02, alpha = 1, position = "identity") +
  scale_x_continuous(breaks = seq(0, 1, by = 0.1)) +
  theme_classic() +
  labs(title = "A",
       x = "Variant Allele Frequency (VAF)",
       y = "Count") +
  scale_fill_manual(
    values = c("AA" = "#ffbf00", "AB" = "#66b032", "BB" = "#1982c4"),
    name = "Genotype"
  ) +
  theme(legend.position = "none")

# VAF: Facetted by genotype
VAF_facet_gt <- ggplot(f2_homo_parents, aes(x = VAF, fill = GT)) +
  geom_histogram(binwidth = 0.02, alpha = 1, position = "identity") +
  facet_wrap(~ GT, scales = "free_y") +
  scale_x_continuous(breaks = seq(0, 1, by = 0.1)) +
  theme_classic() +
  labs(title = "B",
       x = "Variant Allele Frequency (VAF)",
       y = "Count") +
  scale_fill_manual(
    values = c("AA" = "#ffbf00", "AB" = "#66b032", "BB" = "#1982c4"),
    name = "Genotype"
  ) +
  theme(legend.position = "right")

# Combine VAF plots
VAF_combined <- (VAF_all_gt + VAF_facet_gt + plot_layout(widths = c(1, 3)))

# Display the combined VAF plot
VAF_combined

# Add caption below the plot
cat("**Figure 2.** Distribution of Variant Allele Frequency (VAF) values in F2 females from crosses with homozygous F0 and F1 parents.\nA. Overlaid histogram of VAF values across all genotypes.\nB. Faceted distribution of VAF by genotype, showing characteristic frequency patterns for AA, AB, and BB.\nVertical axis represents count of sites; horizontal axis is VAF. Bin width = 0.02.\nGenotypes are color-coded: AA (yellow), AB (green), BB (blue).")

## **Figure 2.** Distribution of Variant Allele Frequency (VAF) values in F2 females from crosses with homozygous F0 and F1 parents.
## A. Overlaid histogram of VAF values across all genotypes.
## B. Faceted distribution of VAF by genotype, showing characteristic frequency patterns for AA, AB, and BB.
## Vertical axis represents count of sites; horizontal axis is VAF. Bin width = 0.02.
## Genotypes are color-coded: AA (yellow), AB (green), BB (blue).

short summary:

413,119 sites were homozygous in F0 and F1.
out of these sites, 45,841 sites were present in their corresponding families. Ofcourese, most of the sites are AA.
for these, we visualized the site’s quality, in each genotype.
because we know what the QAF and VAF should be in a perfect genotyping, we can clearly see which of the sites are errors, and set the boundries accordingly.

The plots make sense. Its visible that there’s a peak of local quality heterozygotes below 0.4, which we can safely filter at.
We therefore propose the following boundaries for each genotype:
AA < 0.03
AB > 0.4 and < 0.6
BB > 0.8

# Create the filtered sites dataset that will be used in the next Rmd file
filtered_sites <- all_param %>%
  filter(
    (GT == "AA" & QAF < 0.03 & VAF < 0.03) |   # Homozygous Reference (AA)
    (GT == "AB" & QAF > 0.4 & QAF < 0.6 & VAF > 0.4 & VAF < 0.6) |  # Heterozygous (AB)
    (GT == "BB" & QAF > 0.8 & VAF > 0.8)    # Homozygous Alternate (BB)
  ) %>%
  select(family, Sample, site, QAF, VAF, GT)  # Keep only necessary columns

# Save the filtered sites dataset for use in the next Rmd file
write.csv(filtered_sites, "/Users/nuriteliash/Documents/GitHub/varroa-pedigree-study/data/filtered_sites.csv", row.names = FALSE)

# Create summary statistics for filtered data
summary_stats <- filtered_sites %>%
  group_by(family, GT) %>%
  summarise(
    count = n(),
    mean_VAF = mean(VAF, na.rm = TRUE),
    mean_QAF = mean(QAF, na.rm = TRUE),
    .groups = 'drop'
  )

# Display summary statistics
knitr::kable(summary_stats, 
             caption = "Summary statistics for filtered sites by family and genotype",
             digits = 3)

Summary statistics for filtered sites by family and genotype
family	GT	count	mean_VAF	mean_QAF
43	AA	39939	0.000	0.000
43	AB	7464	0.495	0.501
43	BB	24072	0.984	0.997
46	AA	60493	0.000	0.000
46	AB	218	0.502	0.502
46	BB	47163	0.985	0.998
48	AA	26935	0.000	0.000
48	AB	3682	0.495	0.501
48	BB	15855	0.985	0.998
57	AA	40307	0.000	0.000
57	AB	3053	0.496	0.502
57	BB	32929	0.984	0.997
63	AA	36264	0.000	0.000
63	AB	6553	0.497	0.502
63	BB	29252	0.985	0.997
110	AA	36865	0.000	0.000
110	AB	9676	0.497	0.502
110	BB	26375	0.985	0.997
133	AA	36824	0.000	0.000
133	AB	11843	0.497	0.503
133	BB	23600	0.984	0.997
177	AA	45206	0.000	0.000
177	AB	3777	0.495	0.501
177	BB	38260	0.985	0.998
187	AA	27181	0.000	0.000
187	AB	5846	0.498	0.502
187	BB	17817	0.986	0.998
240	AA	61550	0.000	0.000
240	AB	253	0.497	0.498
240	BB	50210	0.984	0.997
266	AA	40135	0.000	0.000
266	AB	4297	0.494	0.503
266	BB	25086	0.979	0.998
284	AA	54229	0.000	0.000
284	AB	13335	0.496	0.501
284	BB	44709	0.986	0.998
300	AA	24775	0.000	0.000
300	AB	4937	0.496	0.502
300	BB	17364	0.983	0.996
302	AA	37988	0.000	0.000
302	AB	11809	0.496	0.501
302	BB	28544	0.983	0.996
322	AA	30318	0.000	0.000
322	AB	179	0.482	0.489
322	BB	15646	0.984	0.998
338	AA	51458	0.000	0.000
338	AB	15154	0.495	0.501
338	BB	31715	0.978	0.997
400	AA	60848	0.000	0.000
400	AB	191	0.493	0.493
400	BB	51339	0.982	0.996
412	AA	50585	0.000	0.000
412	AB	179	0.493	0.494
412	BB	35400	0.983	0.996
426	AA	51842	0.000	0.000
426	AB	2628	0.494	0.501
426	BB	34573	0.979	0.997
458	AA	48417	0.000	0.000
458	AB	522	0.493	0.499
458	BB	43567	0.983	0.996
476	AA	87328	0.000	0.000
476	AB	9664	0.496	0.502
476	BB	63542	0.984	0.997
478	AA	54417	0.000	0.000
478	AB	14047	0.496	0.502
478	BB	36540	0.983	0.997
498	AA	36908	0.000	0.000
498	AB	11783	0.496	0.502
498	BB	31353	0.983	0.997
502	AA	33183	0.000	0.000
502	AB	2803	0.496	0.502
502	BB	20025	0.984	0.998
520	AA	24186	0.000	0.000
520	AB	5060	0.495	0.500
520	BB	20782	0.984	0.998
534	AA	39819	0.000	0.000
534	AB	10724	0.494	0.504
534	BB	26413	0.974	0.997
548	AA	22445	0.000	0.000
548	AB	6117	0.493	0.505
548	BB	15725	0.968	0.997
564	AA	24003	0.000	0.000
564	AB	8121	0.495	0.500
564	BB	14657	0.983	0.997
596	AA	27657	0.000	0.000
596	AB	4195	0.496	0.503
596	BB	24738	0.984	0.997
600	AA	23767	0.000	0.000
600	AB	10269	0.492	0.503
600	BB	16611	0.973	0.998

# Save the f2_homo_parents data for use in the next Rmd file and create the QAF and VAF plots for the supp info of the MS
write.csv(f2_homo_parents, "/Users/nuriteliash/Documents/GitHub/varroa-pedigree-study/data/f2_homo_parents.csv", row.names = FALSE)

how many sites were kept after filtering?

Follow genotype flow in one family, on one chromosome

Now we wish to follow the genotype in specific family, in specific chromosome.
to do that, we need to look only at informative sites, for which we have hypothesis about the F2 genotype, based on their parents’ (F1) genotype.

Based on the determined site-quality threshold, we now filter the sites, and plot informative sites in one family on chromosome 1:

# Step 1: Identify informative sites: where F0 (grandmother) = 0/0 and F1 (mother) = 0/1
F0AA_F1AB <- all_param %>%
  filter(generation %in% c("F0", "F1")) %>%  # Keep only F0 and F1 individuals
   filter(chrom %in% c("1")) %>%  # Keep only chromosome 1
 group_by(family, site) %>%  # Group by family and site
  summarise(
    grandmother_homo_ref = any(GT == "AA" & generation == "F0" & sex == "female"),
    mother_heterozygous = any(GT == "AB" & generation == "F1" & sex == "female"),
    .groups = "drop"
  ) %>%
  filter(grandmother_homo_ref & mother_heterozygous) %>%  # Ensure criteria are met
  select(family, site)  # Extract only relevant columns

# Step 2: Count the number of positions per family that passed the filtering
F0AA_F1AB %>%
  group_by(family) %>%
  summarise(num_sites_passed = n())

## # A tibble: 30 × 2
##    family num_sites_passed
##     <dbl>            <int>
##  1     43               35
##  2     46               16
##  3     48               11
##  4     57               21
##  5     63              915
##  6    110               10
##  7    133              647
##  8    177               71
##  9    187              267
## 10    240               42
## # ℹ 20 more rows

Now we have data on all families, will pick one family each time to plot

# set up family and members
fam_id <- 478

one_fam <- F0AA_F1AB %>% filter(family == fam_id)

# Step 3: Extract all family members' genotypes at these sites
family_sites <- one_fam %>%
  inner_join(filtered_sites, by = c("family", "site")) %>% # Keep only sites that passed filtering
  separate(site, into = c("chrom", "pos"), sep = "\\.1_", remove = FALSE) %>% # Split site into chrom and pos
  mutate(pos = as.numeric(pos)) %>% # Convert pos to numeric
  mutate(member = case_when(
    str_detect(Sample, "fnd") ~ "grandmother",
    str_detect(Sample, "_dat") ~ "mother",
    str_detect(Sample, "_son") ~ "father",
    str_detect(Sample, "_grndat") ~ "daughter",
    str_detect(Sample, "_grnson") ~ "son",
    TRUE ~ "unknown"
  )) %>%
  mutate(member = factor(member, levels = c("grandmother", "father", "mother", "son", "daughter"), ordered = TRUE)) %>%
  mutate(member_num = as.numeric(member))

# Find positions that appear in all members
positions_all_members <- family_sites %>%
  group_by(pos) %>%
  summarise(member_count = n_distinct(member), .groups = "drop") %>%
  filter(member_count == n_distinct(family_sites$member)) %>%
  pull(pos)

# Filter f2_filtered to keep only rows with these positions
fam2_common <- family_sites %>% filter(pos %in% positions_all_members)

# Filter f2_filtered to keep only sites with depth higher then 20
#fam2_common <- fam2_common %>% filter(DP > 30)

# Ensure positions are sorted
fam2_common <- fam2_common %>% arrange(pos)

# Generate 30 evenly spaced indices and round them to ensure integer indices
indices <- round(seq(1, nrow(fam2_common), length.out = 30))

# Select the corresponding positions
selected_positions <- fam2_common %>% slice(indices) %>% pull(pos)

# Filter the dataset based on these selected positions
fam2_selected <- fam2_common %>% filter(pos %in% selected_positions) %>%  filter(!is.na(member), !is.na(pos), !is.na(GT))

vizualize genotype flow in one family, for fig A1

# Create a numeric x-axis for spacing (convert 'member' to factor and assign spacing)
fam2_selected %>%
 filter(
    (GT == "AA" & QAF < 0.03 & VAF < 0.03) |   # Homozygous Reference (AA)
    (GT == "AB" & QAF > 0.4 & QAF < 0.6 & VAF > 0.4 & VAF < 0.6) |  # Heterozygous (AB)
    (GT == "BB" & QAF > 0.8 & VAF > 0.8)    # Homozygous Alternate (BB)
  ) %>%
  ggplot(aes(x = member_num, y = pos / 1e6, color = GT)) +  # Convert pos to Mb
 geom_segment(aes(x = member_num - 0.3, xend = member_num + 0.3, 
                   y = pos / 1e6, yend = pos / 1e6), size = 1.5) +  # Apply same conversion
    scale_x_continuous(breaks = unique(fam2_selected$member_num), labels = unique(fam2_selected$member)) +  # Restore original labels
  scale_color_manual(
    values = c("AA" = "#ffbf00", "AB" = "#66b032", "BB" = "#1982c4"),  # Updated colors for renamed genotypes
    name = "Genotype",
    breaks = c("AA", "AB", "BB")  # Ensures BB appears in the legend even if absent
  ) +
  theme_classic() +
  labs(
    title = paste("Genotype Flow Across Family Members 
Family ID:", fam_id),
    x = "Family Member",
    y = "Genomic 
Position (Mb)"
  ) +
  theme(
    axis.text.x = element_text(size = 12, face = "bold", angle = 30, hjust = 1),  # Rotate labels
    panel.grid.major = element_blank(), 
    axis.line.x = element_blank(),  # <-- Removes the x-axis line
    axis.ticks.x = element_blank())  # Removes x-axis ticks

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

save the filtered sites,to be used in the pulled data analysis (~/Documents/GitHub/varroa-pedigree-study/R_scripts/After_revision/figs-for-varroa-pedigree-MS.Rmd)

# Filter sites based on QAF & VAF thresholds while keeping sites per family
filtered_sites <- all_param %>%
  group_by(family, site, Sample) %>%  # Keep sites grouped by family
  filter(
    (GT == "AA" & QAF < 0.03 & VAF < 0.03) |   # Homozygous Reference (AA)
    (GT == "AB" & QAF > 0.4 & QAF < 0.6 & VAF > 0.4 & VAF < 0.6) |  # Heterozygous (AB)
    (GT == "BB" & QAF > 0.8 & VAF > 0.8)    # Homozygous Alternate (BB)
  ) %>%
  ungroup() %>%  # Ensure the result is ungrouped for later use
  select(family, Sample, site, QAF, VAF, GT)  # Keep family-site-sample pairs without removing duplicates

filtered_sites_summary <- filtered_sites %>%
 group_by(family, Sample) %>%  # Keep sites grouped by family
  summarise(num_sites_kept = n(), .groups = "drop") %>%  # Keep counts by Sample
  mutate(family = as.character(family), 
         num_sites_kept = as.numeric(num_sites_kept))

# Save to CSV
write.csv(filtered_sites, "/Users/nuriteliash/Documents/GitHub/varroa-pedigree-study/data/filtered_sites.csv", row.names = FALSE)

# save the fam2_selected to make the figure 1A in the figures RMD:  ~/Documents/GitHub/varroa-pedigree-study/R_scripts/After_revision/figs-for-varroa-pedigree-MS.Rmd
write.csv(fam2_selected, "/Users/nuriteliash/Documents/GitHub/varroa-pedigree-study/data/fam2_selected.csv", row.names = FALSE)

site quality exploration