## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.14.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.5.1     ✔ purrr   1.0.1
## ✔ tibble  3.2.1     ✔ dplyr   1.1.4
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.4     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Load the metadata CSV file into a data frame 'dat', and keep only one F1 female per family. 
dat <- read_csv("/Users/nuriteliash/Documents/GitHub/varroa-pedigree-study/data/meta_data_223.csv") %>% 
   rename(id = sample) %>% # Rename 'sample' column to 'id'
 filter(!str_detect(name, "sis")) %>% # remove sisters and aunties, F1s without offspring
  filter(!str_detect(name, "479-2")) %>%   # in family 478, i kept 479-1
    filter(!str_detect(name, "565-2")) %>%   # in Family 564, I kept 565-1
    filter(!str_detect(name, "535_1")) %>%   # in Family 534, I kept 535-2
    filter(!str_detect(name, "535_3"))  # in Family 534, I kept 535-2
## Rows: 223 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): name, sample, dev_stage, generation, sex, pedigree
## dbl (1): family
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the VCF file into an object 'vcf' using the vcfR library; 'verbose = FALSE' suppresses extra output
vcf <- read.vcfR("/Users/nuriteliash/Documents/GitHub/varroa-pedigree-study/data/vcf_filter/Q40BIALLDP16HDP40mis.5Chr7/Q40BIALLDP16HDP40mis.5Chr7.recode.vcf", verbose = FALSE)

# Display a summary of the VCF object (basic details about the VCF data)
vcf
## ***** Object of Class vcfR *****
## 223 samples
## 7 CHROMs
## 35,169 variants
## Object size: 293.5 Mb
## 0 percent missing data
## *****        *****         *****
# Rename the column names of the genotype matrix ('vcf@gt'):
colnames(vcf@gt) <- sub("_S\\d+$", "", colnames(vcf@gt))
  
# Extract the genotype (GT) and depth-related fields from the FORMAT column
gt <- extract.gt(vcf, element = "GT", as.numeric = FALSE)  # Extract Genotype (GT)
dp <- extract.gt(vcf, element = "DP", as.numeric = TRUE)  # Extract Depth (DP)
ao <- extract.gt(vcf, element = "AO", as.numeric = TRUE)  # Extract Alternate Observations (AO)
ad <- extract.gt(vcf, element = "AD") # Extract the allele depth (AD)

Variant Allele Frequency (VAF)

# Ensure AO and DP are numeric and avoid division by zero
ao[is.na(ao)] <- 0
dp[is.na(dp)] <- 1  # Avoid division by zero

# Calculate VAF for each site and sample
vaf_df <- ao / dp

# Convert to a long format for easier visualization
vaf_long <- as.data.frame(vaf_df) %>%
  rownames_to_column(var = "site") %>%
  pivot_longer(-site, names_to = "id", values_to = "VAF")

Genotyping quality (QAF)

qa <- extract.gt(vcf, element = "QA", as.numeric = TRUE)  # Quality of Alt Allele
qo <- extract.gt(vcf, element = "QR", as.numeric = TRUE)  # Quality of Ref Allele

# Ensure missing values are handled
qa[is.na(qa)] <- 0
qo[is.na(qo)] <- 0

# Compute QAF for each site and sample
qaf <- qa / (qa + qo)

# Convert to long format for better visualization
qaf_long <- as.data.frame(qaf) %>%
  rownames_to_column(var = "site") %>%
  pivot_longer(-site, names_to = "id", values_to = "QAF")

Heterozygosity

is_het <- as.data.frame(is_het(gt)) %>% 
  rownames_to_column(var = "site") %>% 
  pivot_longer(-site, names_to = "id", values_to = "het") 

Allele depth (AD)

ad2 <- as.data.frame(ad) %>% 
  rownames_to_column(var = "site") %>% 
  pivot_longer(-site, names_to = "id", values_to = "ad") %>% 
  separate("ad", into = c("ref","alt"), sep = ",") 

ad_join <- select(dat, generation, sex, family, id = name) %>% 
  left_join(ad2) %>%  
  mutate(chrom = str_split(site, pattern = "_", simplify = TRUE)[, 2]) 
## Joining with `by = join_by(id)`
ad_join <- left_join(is_het, ad_join)
## Joining with `by = join_by(site, id)`
ad_join <- ad_join %>% mutate(ref = as.numeric(ref), alt = as.numeric(alt)) 
ad_join <- mutate(ad_join, chrom = factor(chrom, levels = unique(chrom), labels = seq_len(7)))  

Genotype (GT)

gt_long <- gt %>% 
as.data.frame() %>% 
  rownames_to_column("site") %>% 
  pivot_longer(cols = -site, names_to = "id", values_to = "GT")

Depth (DP)

dp_long <- dp %>% 
as.data.frame() %>% 
  rownames_to_column("site") %>% 
  pivot_longer(cols = -site, names_to = "id", values_to = "DP")

Join all site parameters

# Combine information on allele depth, heterozygosity, VAF, GQ and family structure 
all_param <- ad_join %>%
  left_join(qaf_long, by = c("site", "id")) %>%
  left_join(vaf_long, by = c("site", "id")) %>%
  left_join(gt_long, by = c("site", "id")) %>%
  left_join(dp_long, by = c("site", "id")) %>%
na.omit() %>%
  rename(Sample = id) %>% 
mutate(member = case_when(
    generation == "F0" & sex == "female" ~ "grandmother",
    generation == "F1" & sex == "male" ~ "father",
    generation == "F1" & sex == "female" ~ "mother",
    generation == "F2" & sex == "male" ~ "son",
    generation == "F2" & sex == "female" ~ "daughter",
    TRUE ~ "unknown"
  )) %>%
  separate(site, sep = ".1_", into = c("Chrom", "pos"), remove = FALSE) %>%  # Keep "site"
mutate(pos = as.numeric(pos))

all_param <- all_param %>% filter(!(member %in% "unknown")) %>% # remove nymphs, keep only adult mites with determined sex 
 mutate(
    GT = factor(GT, levels = c("0/0", "0/1", "1/1"), labels = c("AA", "AB", "BB")),  # Ensure "BB" exists in factor levels
    member = factor(member, levels = c("grandmother", "father", "mother", "son", "daughter"), ordered = TRUE)  # Explicit order
  ) %>%
  mutate(member_num = as.numeric(member))  # Convert ordered factor to numeric

Sanity check, to determine the threshold for site filtering

In this section, we will filter the sites based on the VAF and QAF distributions, under the assumption, that when F0 and F1 are all homozygous, then their F2 female offspring, must be homozygous as well.
As a result, we can also assume, that any genotype in the F2 female that is NOT AA, is an error.
It is therefore should be safe to set the threshold for site quality (VAF and QAF values), based on these filtered sites.

F0 and F1 Homo AA, looking at F2 daughter (should all be AA)

Find a threshold for VAF and QAF

# Find positions where all F0 and all F1 individuals are homozygous AA
AA_sites_F0F1 <- all_param %>%
  filter(generation %in% c("F0", "F1")) %>%  # Keep only F0 and F1 individuals
  group_by(family, site) %>%  # Group by family and position
  summarise(all_homo = all(GT == "AA"), .groups = "drop") %>%  # Ensure all individuals in family are AA
  filter(all_homo) %>%  # Keep only positions where all individuals in F0 and F1 are AA
  select(family, site)  # Extract family and positions

# Count the number of positions per family that passed the homo filtering
AA_sites_F0F1 %>%
  group_by(family) %>%
  summarise(num_sites_passed = n())
## # A tibble: 30 × 2
##    family num_sites_passed
##     <dbl>            <int>
##  1     43            14919
##  2     46            18552
##  3     48            16832
##  4     57            15997
##  5     63             7653
##  6    110             9645
##  7    133            11801
##  8    177            16354
##  9    187            14527
## 10    240            18495
## # ℹ 20 more rows
# Filter the F2 females at the selected positions, on chrom 1
f2_homo_parents <- all_param %>%
  filter(generation == "F2") %>%
  filter(chrom == "1") %>%
  filter(sex == "female") %>%
  # Join with AA_sites_F0F1 to keep only sites that passed the filtering
  inner_join(AA_sites_F0F1, by = c("family", "site"))

# Visualize VAF and QAF distributions to determine thresholds
# QAF: All genotypes overlaid
QAF_all_gt <- ggplot(f2_homo_parents, aes(x = QAF, fill = GT)) +
  geom_histogram(binwidth = 0.02, alpha = 1, position = "identity") +
  scale_x_continuous(breaks = seq(0, 1, by = 0.1)) +
  theme_classic() +
  labs(title = "A",
       x = "Quality Allele Frequency (QAF)",
       y = "Count") +
  scale_fill_manual(
    values = c("AA" = "#ffbf00", "AB" = "#66b032", "BB" = "#1982c4"),
    name = "Genotype"
  ) +
  theme(legend.position = "none")

# QAF: Facetted by genotype
QAF_facet_gt <- ggplot(f2_homo_parents, aes(x = QAF, fill = GT)) +
  geom_histogram(binwidth = 0.02, alpha = 1, position = "identity") +
  facet_wrap(~ GT, scales = "free_y") +
  scale_x_continuous(breaks = seq(0, 1, by = 0.1)) +
  theme_classic() +
  labs(title = "B",
       x = "Quality Allele Frequency (QAF)",
       y = "Count") +
  scale_fill_manual(
    values = c("AA" = "#ffbf00", "AB" = "#66b032", "BB" = "#1982c4"),
    name = "Genotype"
  ) +
  theme(legend.position = "right")

# Combine QAF plots
QAF_combined <- (QAF_all_gt + QAF_facet_gt + plot_layout(widths = c(1, 3)))

# Display the combined QAF plot
QAF_combined

# Add caption below the plot
cat("**Figure 1.** Distribution of Quality Allele Frequency (QAF) values in F2 females from crosses with homozygous F0 and F1 parents.\nA. Overlaid histogram of QAF values for all genotypes.\nB. Same data faceted by genotype (AA, AB, BB), showing the distribution and spread within each genotype class.\nVertical axis represents count of sites; horizontal axis is QAF. Bin width = 0.02.\nGenotypes are color-coded: AA (yellow), AB (green), BB (blue).")
## **Figure 1.** Distribution of Quality Allele Frequency (QAF) values in F2 females from crosses with homozygous F0 and F1 parents.
## A. Overlaid histogram of QAF values for all genotypes.
## B. Same data faceted by genotype (AA, AB, BB), showing the distribution and spread within each genotype class.
## Vertical axis represents count of sites; horizontal axis is QAF. Bin width = 0.02.
## Genotypes are color-coded: AA (yellow), AB (green), BB (blue).
# VAF: All genotypes overlaid
VAF_all_gt <- ggplot(f2_homo_parents, aes(x = VAF, fill = GT)) +
  geom_histogram(binwidth = 0.02, alpha = 1, position = "identity") +
  scale_x_continuous(breaks = seq(0, 1, by = 0.1)) +
  theme_classic() +
  labs(title = "A",
       x = "Variant Allele Frequency (VAF)",
       y = "Count") +
  scale_fill_manual(
    values = c("AA" = "#ffbf00", "AB" = "#66b032", "BB" = "#1982c4"),
    name = "Genotype"
  ) +
  theme(legend.position = "none")

# VAF: Facetted by genotype
VAF_facet_gt <- ggplot(f2_homo_parents, aes(x = VAF, fill = GT)) +
  geom_histogram(binwidth = 0.02, alpha = 1, position = "identity") +
  facet_wrap(~ GT, scales = "free_y") +
  scale_x_continuous(breaks = seq(0, 1, by = 0.1)) +
  theme_classic() +
  labs(title = "B",
       x = "Variant Allele Frequency (VAF)",
       y = "Count") +
  scale_fill_manual(
    values = c("AA" = "#ffbf00", "AB" = "#66b032", "BB" = "#1982c4"),
    name = "Genotype"
  ) +
  theme(legend.position = "right")

# Combine VAF plots
VAF_combined <- (VAF_all_gt + VAF_facet_gt + plot_layout(widths = c(1, 3)))

# Display the combined VAF plot
VAF_combined

# Add caption below the plot
cat("**Figure 2.** Distribution of Variant Allele Frequency (VAF) values in F2 females from crosses with homozygous F0 and F1 parents.\nA. Overlaid histogram of VAF values across all genotypes.\nB. Faceted distribution of VAF by genotype, showing characteristic frequency patterns for AA, AB, and BB.\nVertical axis represents count of sites; horizontal axis is VAF. Bin width = 0.02.\nGenotypes are color-coded: AA (yellow), AB (green), BB (blue).")
## **Figure 2.** Distribution of Variant Allele Frequency (VAF) values in F2 females from crosses with homozygous F0 and F1 parents.
## A. Overlaid histogram of VAF values across all genotypes.
## B. Faceted distribution of VAF by genotype, showing characteristic frequency patterns for AA, AB, and BB.
## Vertical axis represents count of sites; horizontal axis is VAF. Bin width = 0.02.
## Genotypes are color-coded: AA (yellow), AB (green), BB (blue).

short summary:

  • 413,119 sites were homozygous in F0 and F1.
  • out of these sites, 45,841 sites were present in their corresponding families. Ofcourese, most of the sites are AA.
  • for these, we visualized the site’s quality, in each genotype.
  • because we know what the QAF and VAF should be in a perfect genotyping, we can clearly see which of the sites are errors, and set the boundries accordingly.

The plots make sense. Its visible that there’s a peak of local quality heterozygotes below 0.4, which we can safely filter at.
We therefore propose the following boundaries for each genotype:
AA < 0.03
AB > 0.4 and < 0.6
BB > 0.8

# Create the filtered sites dataset that will be used in the next Rmd file
filtered_sites <- all_param %>%
  filter(
    (GT == "AA" & QAF < 0.03 & VAF < 0.03) |   # Homozygous Reference (AA)
    (GT == "AB" & QAF > 0.4 & QAF < 0.6 & VAF > 0.4 & VAF < 0.6) |  # Heterozygous (AB)
    (GT == "BB" & QAF > 0.8 & VAF > 0.8)    # Homozygous Alternate (BB)
  ) %>%
  select(family, Sample, site, QAF, VAF, GT)  # Keep only necessary columns

# Save the filtered sites dataset for use in the next Rmd file
write.csv(filtered_sites, "/Users/nuriteliash/Documents/GitHub/varroa-pedigree-study/data/filtered_sites.csv", row.names = FALSE)

# Create summary statistics for filtered data
summary_stats <- filtered_sites %>%
  group_by(family, GT) %>%
  summarise(
    count = n(),
    mean_VAF = mean(VAF, na.rm = TRUE),
    mean_QAF = mean(QAF, na.rm = TRUE),
    .groups = 'drop'
  )

# Display summary statistics
knitr::kable(summary_stats, 
             caption = "Summary statistics for filtered sites by family and genotype",
             digits = 3)
Summary statistics for filtered sites by family and genotype
family GT count mean_VAF mean_QAF
43 AA 39939 0.000 0.000
43 AB 7464 0.495 0.501
43 BB 24072 0.984 0.997
46 AA 60493 0.000 0.000
46 AB 218 0.502 0.502
46 BB 47163 0.985 0.998
48 AA 26935 0.000 0.000
48 AB 3682 0.495 0.501
48 BB 15855 0.985 0.998
57 AA 40307 0.000 0.000
57 AB 3053 0.496 0.502
57 BB 32929 0.984 0.997
63 AA 36264 0.000 0.000
63 AB 6553 0.497 0.502
63 BB 29252 0.985 0.997
110 AA 36865 0.000 0.000
110 AB 9676 0.497 0.502
110 BB 26375 0.985 0.997
133 AA 36824 0.000 0.000
133 AB 11843 0.497 0.503
133 BB 23600 0.984 0.997
177 AA 45206 0.000 0.000
177 AB 3777 0.495 0.501
177 BB 38260 0.985 0.998
187 AA 27181 0.000 0.000
187 AB 5846 0.498 0.502
187 BB 17817 0.986 0.998
240 AA 61550 0.000 0.000
240 AB 253 0.497 0.498
240 BB 50210 0.984 0.997
266 AA 40135 0.000 0.000
266 AB 4297 0.494 0.503
266 BB 25086 0.979 0.998
284 AA 54229 0.000 0.000
284 AB 13335 0.496 0.501
284 BB 44709 0.986 0.998
300 AA 24775 0.000 0.000
300 AB 4937 0.496 0.502
300 BB 17364 0.983 0.996
302 AA 37988 0.000 0.000
302 AB 11809 0.496 0.501
302 BB 28544 0.983 0.996
322 AA 30318 0.000 0.000
322 AB 179 0.482 0.489
322 BB 15646 0.984 0.998
338 AA 51458 0.000 0.000
338 AB 15154 0.495 0.501
338 BB 31715 0.978 0.997
400 AA 60848 0.000 0.000
400 AB 191 0.493 0.493
400 BB 51339 0.982 0.996
412 AA 50585 0.000 0.000
412 AB 179 0.493 0.494
412 BB 35400 0.983 0.996
426 AA 51842 0.000 0.000
426 AB 2628 0.494 0.501
426 BB 34573 0.979 0.997
458 AA 48417 0.000 0.000
458 AB 522 0.493 0.499
458 BB 43567 0.983 0.996
476 AA 87328 0.000 0.000
476 AB 9664 0.496 0.502
476 BB 63542 0.984 0.997
478 AA 54417 0.000 0.000
478 AB 14047 0.496 0.502
478 BB 36540 0.983 0.997
498 AA 36908 0.000 0.000
498 AB 11783 0.496 0.502
498 BB 31353 0.983 0.997
502 AA 33183 0.000 0.000
502 AB 2803 0.496 0.502
502 BB 20025 0.984 0.998
520 AA 24186 0.000 0.000
520 AB 5060 0.495 0.500
520 BB 20782 0.984 0.998
534 AA 39819 0.000 0.000
534 AB 10724 0.494 0.504
534 BB 26413 0.974 0.997
548 AA 22445 0.000 0.000
548 AB 6117 0.493 0.505
548 BB 15725 0.968 0.997
564 AA 24003 0.000 0.000
564 AB 8121 0.495 0.500
564 BB 14657 0.983 0.997
596 AA 27657 0.000 0.000
596 AB 4195 0.496 0.503
596 BB 24738 0.984 0.997
600 AA 23767 0.000 0.000
600 AB 10269 0.492 0.503
600 BB 16611 0.973 0.998
# Save the f2_homo_parents data for use in the next Rmd file and create the QAF and VAF plots for the supp info of the MS
write.csv(f2_homo_parents, "/Users/nuriteliash/Documents/GitHub/varroa-pedigree-study/data/f2_homo_parents.csv", row.names = FALSE)

how many sites were kept after filtering?


Follow genotype flow in one family, on one chromosome

Now we wish to follow the genotype in specific family, in specific chromosome.
to do that, we need to look only at informative sites, for which we have hypothesis about the F2 genotype, based on their parents’ (F1) genotype.

Based on the determined site-quality threshold, we now filter the sites, and plot informative sites in one family on chromosome 1:

# Step 1: Identify informative sites: where F0 (grandmother) = 0/0 and F1 (mother) = 0/1
F0AA_F1AB <- all_param %>%
  filter(generation %in% c("F0", "F1")) %>%  # Keep only F0 and F1 individuals
   filter(chrom %in% c("1")) %>%  # Keep only chromosome 1
 group_by(family, site) %>%  # Group by family and site
  summarise(
    grandmother_homo_ref = any(GT == "AA" & generation == "F0" & sex == "female"),
    mother_heterozygous = any(GT == "AB" & generation == "F1" & sex == "female"),
    .groups = "drop"
  ) %>%
  filter(grandmother_homo_ref & mother_heterozygous) %>%  # Ensure criteria are met
  select(family, site)  # Extract only relevant columns

# Step 2: Count the number of positions per family that passed the filtering
F0AA_F1AB %>%
  group_by(family) %>%
  summarise(num_sites_passed = n())
## # A tibble: 30 × 2
##    family num_sites_passed
##     <dbl>            <int>
##  1     43               35
##  2     46               16
##  3     48               11
##  4     57               21
##  5     63              915
##  6    110               10
##  7    133              647
##  8    177               71
##  9    187              267
## 10    240               42
## # ℹ 20 more rows

Now we have data on all families, will pick one family each time to plot

# set up family and members
fam_id <- 478

one_fam <- F0AA_F1AB %>% filter(family == fam_id)

# Step 3: Extract all family members' genotypes at these sites
family_sites <- one_fam %>%
  inner_join(filtered_sites, by = c("family", "site")) %>% # Keep only sites that passed filtering
  separate(site, into = c("chrom", "pos"), sep = "\\.1_", remove = FALSE) %>% # Split site into chrom and pos
  mutate(pos = as.numeric(pos)) %>% # Convert pos to numeric
  mutate(member = case_when(
    str_detect(Sample, "fnd") ~ "grandmother",
    str_detect(Sample, "_dat") ~ "mother",
    str_detect(Sample, "_son") ~ "father",
    str_detect(Sample, "_grndat") ~ "daughter",
    str_detect(Sample, "_grnson") ~ "son",
    TRUE ~ "unknown"
  )) %>%
  mutate(member = factor(member, levels = c("grandmother", "father", "mother", "son", "daughter"), ordered = TRUE)) %>%
  mutate(member_num = as.numeric(member))

# Find positions that appear in all members
positions_all_members <- family_sites %>%
  group_by(pos) %>%
  summarise(member_count = n_distinct(member), .groups = "drop") %>%
  filter(member_count == n_distinct(family_sites$member)) %>%
  pull(pos)

# Filter f2_filtered to keep only rows with these positions
fam2_common <- family_sites %>% filter(pos %in% positions_all_members)

# Filter f2_filtered to keep only sites with depth higher then 20
#fam2_common <- fam2_common %>% filter(DP > 30)

# Ensure positions are sorted
fam2_common <- fam2_common %>% arrange(pos)

# Generate 30 evenly spaced indices and round them to ensure integer indices
indices <- round(seq(1, nrow(fam2_common), length.out = 30))

# Select the corresponding positions
selected_positions <- fam2_common %>% slice(indices) %>% pull(pos)

# Filter the dataset based on these selected positions
fam2_selected <- fam2_common %>% filter(pos %in% selected_positions) %>%  filter(!is.na(member), !is.na(pos), !is.na(GT))

vizualize genotype flow in one family, for fig A1

# Create a numeric x-axis for spacing (convert 'member' to factor and assign spacing)
fam2_selected %>%
 filter(
    (GT == "AA" & QAF < 0.03 & VAF < 0.03) |   # Homozygous Reference (AA)
    (GT == "AB" & QAF > 0.4 & QAF < 0.6 & VAF > 0.4 & VAF < 0.6) |  # Heterozygous (AB)
    (GT == "BB" & QAF > 0.8 & VAF > 0.8)    # Homozygous Alternate (BB)
  ) %>%
  ggplot(aes(x = member_num, y = pos / 1e6, color = GT)) +  # Convert pos to Mb
 geom_segment(aes(x = member_num - 0.3, xend = member_num + 0.3, 
                   y = pos / 1e6, yend = pos / 1e6), size = 1.5) +  # Apply same conversion
    scale_x_continuous(breaks = unique(fam2_selected$member_num), labels = unique(fam2_selected$member)) +  # Restore original labels
  scale_color_manual(
    values = c("AA" = "#ffbf00", "AB" = "#66b032", "BB" = "#1982c4"),  # Updated colors for renamed genotypes
    name = "Genotype",
    breaks = c("AA", "AB", "BB")  # Ensures BB appears in the legend even if absent
  ) +
  theme_classic() +
  labs(
    title = paste("Genotype Flow Across Family Members 
Family ID:", fam_id),
    x = "Family Member",
    y = "Genomic 
Position (Mb)"
  ) +
  theme(
    axis.text.x = element_text(size = 12, face = "bold", angle = 30, hjust = 1),  # Rotate labels
    panel.grid.major = element_blank(), 
    axis.line.x = element_blank(),  # <-- Removes the x-axis line
    axis.ticks.x = element_blank())  # Removes x-axis ticks
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

save the filtered sites,to be used in the pulled data analysis (~/Documents/GitHub/varroa-pedigree-study/R_scripts/After_revision/figs-for-varroa-pedigree-MS.Rmd)

# Filter sites based on QAF & VAF thresholds while keeping sites per family
filtered_sites <- all_param %>%
  group_by(family, site, Sample) %>%  # Keep sites grouped by family
  filter(
    (GT == "AA" & QAF < 0.03 & VAF < 0.03) |   # Homozygous Reference (AA)
    (GT == "AB" & QAF > 0.4 & QAF < 0.6 & VAF > 0.4 & VAF < 0.6) |  # Heterozygous (AB)
    (GT == "BB" & QAF > 0.8 & VAF > 0.8)    # Homozygous Alternate (BB)
  ) %>%
  ungroup() %>%  # Ensure the result is ungrouped for later use
  select(family, Sample, site, QAF, VAF, GT)  # Keep family-site-sample pairs without removing duplicates

filtered_sites_summary <- filtered_sites %>%
 group_by(family, Sample) %>%  # Keep sites grouped by family
  summarise(num_sites_kept = n(), .groups = "drop") %>%  # Keep counts by Sample
  mutate(family = as.character(family), 
         num_sites_kept = as.numeric(num_sites_kept))

# Save to CSV
write.csv(filtered_sites, "/Users/nuriteliash/Documents/GitHub/varroa-pedigree-study/data/filtered_sites.csv", row.names = FALSE)
# save the fam2_selected to make the figure 1A in the figures RMD:  ~/Documents/GitHub/varroa-pedigree-study/R_scripts/After_revision/figs-for-varroa-pedigree-MS.Rmd
write.csv(fam2_selected, "/Users/nuriteliash/Documents/GitHub/varroa-pedigree-study/data/fam2_selected.csv", row.names = FALSE)

Archived

A plot to vizualize gene flow in one family