##
## ***** *** vcfR *** *****
## This is vcfR 1.14.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.5.1 ✔ purrr 1.0.1
## ✔ tibble 3.2.1 ✔ dplyr 1.1.4
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.4 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Load the metadata CSV file into a data frame 'dat', and keep only one F1 female per family.
dat <- read_csv("/Users/nuriteliash/Documents/GitHub/varroa-pedigree-study/data/meta_data_223.csv") %>%
rename(id = sample) %>% # Rename 'sample' column to 'id'
filter(!str_detect(name, "sis")) %>% # remove sisters and aunties, F1s without offspring
filter(!str_detect(name, "479-2")) %>% # in family 478, i kept 479-1
filter(!str_detect(name, "565-2")) %>% # in Family 564, I kept 565-1
filter(!str_detect(name, "535_1")) %>% # in Family 534, I kept 535-2
filter(!str_detect(name, "535_3")) # in Family 534, I kept 535-2
## Rows: 223 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): name, sample, dev_stage, generation, sex, pedigree
## dbl (1): family
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the VCF file into an object 'vcf' using the vcfR library; 'verbose = FALSE' suppresses extra output
vcf <- read.vcfR("/Users/nuriteliash/Documents/GitHub/varroa-pedigree-study/data/vcf_filter/Q40BIALLDP16HDP40mis.5Chr7/Q40BIALLDP16HDP40mis.5Chr7.recode.vcf", verbose = FALSE)
# Display a summary of the VCF object (basic details about the VCF data)
vcf
## ***** Object of Class vcfR *****
## 223 samples
## 7 CHROMs
## 35,169 variants
## Object size: 293.5 Mb
## 0 percent missing data
## ***** ***** *****
# Rename the column names of the genotype matrix ('vcf@gt'):
colnames(vcf@gt) <- sub("_S\\d+$", "", colnames(vcf@gt))
# Extract the genotype (GT) and depth-related fields from the FORMAT column
gt <- extract.gt(vcf, element = "GT", as.numeric = FALSE) # Extract Genotype (GT)
dp <- extract.gt(vcf, element = "DP", as.numeric = TRUE) # Extract Depth (DP)
ao <- extract.gt(vcf, element = "AO", as.numeric = TRUE) # Extract Alternate Observations (AO)
ad <- extract.gt(vcf, element = "AD") # Extract the allele depth (AD)
# Ensure AO and DP are numeric and avoid division by zero
ao[is.na(ao)] <- 0
dp[is.na(dp)] <- 1 # Avoid division by zero
# Calculate VAF for each site and sample
vaf_df <- ao / dp
# Convert to a long format for easier visualization
vaf_long <- as.data.frame(vaf_df) %>%
rownames_to_column(var = "site") %>%
pivot_longer(-site, names_to = "id", values_to = "VAF")
qa <- extract.gt(vcf, element = "QA", as.numeric = TRUE) # Quality of Alt Allele
qo <- extract.gt(vcf, element = "QR", as.numeric = TRUE) # Quality of Ref Allele
# Ensure missing values are handled
qa[is.na(qa)] <- 0
qo[is.na(qo)] <- 0
# Compute QAF for each site and sample
qaf <- qa / (qa + qo)
# Convert to long format for better visualization
qaf_long <- as.data.frame(qaf) %>%
rownames_to_column(var = "site") %>%
pivot_longer(-site, names_to = "id", values_to = "QAF")
is_het <- as.data.frame(is_het(gt)) %>%
rownames_to_column(var = "site") %>%
pivot_longer(-site, names_to = "id", values_to = "het")
ad2 <- as.data.frame(ad) %>%
rownames_to_column(var = "site") %>%
pivot_longer(-site, names_to = "id", values_to = "ad") %>%
separate("ad", into = c("ref","alt"), sep = ",")
ad_join <- select(dat, generation, sex, family, id = name) %>%
left_join(ad2) %>%
mutate(chrom = str_split(site, pattern = "_", simplify = TRUE)[, 2])
## Joining with `by = join_by(id)`
ad_join <- left_join(is_het, ad_join)
## Joining with `by = join_by(site, id)`
ad_join <- ad_join %>% mutate(ref = as.numeric(ref), alt = as.numeric(alt))
ad_join <- mutate(ad_join, chrom = factor(chrom, levels = unique(chrom), labels = seq_len(7)))
gt_long <- gt %>%
as.data.frame() %>%
rownames_to_column("site") %>%
pivot_longer(cols = -site, names_to = "id", values_to = "GT")
dp_long <- dp %>%
as.data.frame() %>%
rownames_to_column("site") %>%
pivot_longer(cols = -site, names_to = "id", values_to = "DP")
# Combine information on allele depth, heterozygosity, VAF, GQ and family structure
all_param <- ad_join %>%
left_join(qaf_long, by = c("site", "id")) %>%
left_join(vaf_long, by = c("site", "id")) %>%
left_join(gt_long, by = c("site", "id")) %>%
left_join(dp_long, by = c("site", "id")) %>%
na.omit() %>%
rename(Sample = id) %>%
mutate(member = case_when(
generation == "F0" & sex == "female" ~ "grandmother",
generation == "F1" & sex == "male" ~ "father",
generation == "F1" & sex == "female" ~ "mother",
generation == "F2" & sex == "male" ~ "son",
generation == "F2" & sex == "female" ~ "daughter",
TRUE ~ "unknown"
)) %>%
separate(site, sep = ".1_", into = c("Chrom", "pos"), remove = FALSE) %>% # Keep "site"
mutate(pos = as.numeric(pos))
all_param <- all_param %>% filter(!(member %in% "unknown")) %>% # remove nymphs, keep only adult mites with determined sex
mutate(
GT = factor(GT, levels = c("0/0", "0/1", "1/1"), labels = c("AA", "AB", "BB")), # Ensure "BB" exists in factor levels
member = factor(member, levels = c("grandmother", "father", "mother", "son", "daughter"), ordered = TRUE) # Explicit order
) %>%
mutate(member_num = as.numeric(member)) # Convert ordered factor to numeric
In this section, we will filter the sites based on the VAF and QAF
distributions, under the assumption, that when F0 and F1 are all
homozygous, then their F2 female offspring, must be homozygous as
well.
As a result, we can also assume, that any genotype in the F2 female that
is NOT AA, is an error.
It is therefore should be safe to set the threshold for site quality
(VAF and QAF values), based on these filtered sites.
# Find positions where all F0 and all F1 individuals are homozygous AA
AA_sites_F0F1 <- all_param %>%
filter(generation %in% c("F0", "F1")) %>% # Keep only F0 and F1 individuals
group_by(family, site) %>% # Group by family and position
summarise(all_homo = all(GT == "AA"), .groups = "drop") %>% # Ensure all individuals in family are AA
filter(all_homo) %>% # Keep only positions where all individuals in F0 and F1 are AA
select(family, site) # Extract family and positions
# Count the number of positions per family that passed the homo filtering
AA_sites_F0F1 %>%
group_by(family) %>%
summarise(num_sites_passed = n())
## # A tibble: 30 × 2
## family num_sites_passed
## <dbl> <int>
## 1 43 14919
## 2 46 18552
## 3 48 16832
## 4 57 15997
## 5 63 7653
## 6 110 9645
## 7 133 11801
## 8 177 16354
## 9 187 14527
## 10 240 18495
## # ℹ 20 more rows
# Filter the F2 females at the selected positions, on chrom 1
f2_homo_parents <- all_param %>%
filter(generation == "F2") %>%
filter(chrom == "1") %>%
filter(sex == "female") %>%
# Join with AA_sites_F0F1 to keep only sites that passed the filtering
inner_join(AA_sites_F0F1, by = c("family", "site"))
# Visualize VAF and QAF distributions to determine thresholds
# QAF: All genotypes overlaid
QAF_all_gt <- ggplot(f2_homo_parents, aes(x = QAF, fill = GT)) +
geom_histogram(binwidth = 0.02, alpha = 1, position = "identity") +
scale_x_continuous(breaks = seq(0, 1, by = 0.1)) +
theme_classic() +
labs(title = "A",
x = "Quality Allele Frequency (QAF)",
y = "Count") +
scale_fill_manual(
values = c("AA" = "#ffbf00", "AB" = "#66b032", "BB" = "#1982c4"),
name = "Genotype"
) +
theme(legend.position = "none")
# QAF: Facetted by genotype
QAF_facet_gt <- ggplot(f2_homo_parents, aes(x = QAF, fill = GT)) +
geom_histogram(binwidth = 0.02, alpha = 1, position = "identity") +
facet_wrap(~ GT, scales = "free_y") +
scale_x_continuous(breaks = seq(0, 1, by = 0.1)) +
theme_classic() +
labs(title = "B",
x = "Quality Allele Frequency (QAF)",
y = "Count") +
scale_fill_manual(
values = c("AA" = "#ffbf00", "AB" = "#66b032", "BB" = "#1982c4"),
name = "Genotype"
) +
theme(legend.position = "right")
# Combine QAF plots
QAF_combined <- (QAF_all_gt + QAF_facet_gt + plot_layout(widths = c(1, 3)))
# Display the combined QAF plot
QAF_combined
# Add caption below the plot
cat("**Figure 1.** Distribution of Quality Allele Frequency (QAF) values in F2 females from crosses with homozygous F0 and F1 parents.\nA. Overlaid histogram of QAF values for all genotypes.\nB. Same data faceted by genotype (AA, AB, BB), showing the distribution and spread within each genotype class.\nVertical axis represents count of sites; horizontal axis is QAF. Bin width = 0.02.\nGenotypes are color-coded: AA (yellow), AB (green), BB (blue).")
## **Figure 1.** Distribution of Quality Allele Frequency (QAF) values in F2 females from crosses with homozygous F0 and F1 parents.
## A. Overlaid histogram of QAF values for all genotypes.
## B. Same data faceted by genotype (AA, AB, BB), showing the distribution and spread within each genotype class.
## Vertical axis represents count of sites; horizontal axis is QAF. Bin width = 0.02.
## Genotypes are color-coded: AA (yellow), AB (green), BB (blue).
# VAF: All genotypes overlaid
VAF_all_gt <- ggplot(f2_homo_parents, aes(x = VAF, fill = GT)) +
geom_histogram(binwidth = 0.02, alpha = 1, position = "identity") +
scale_x_continuous(breaks = seq(0, 1, by = 0.1)) +
theme_classic() +
labs(title = "A",
x = "Variant Allele Frequency (VAF)",
y = "Count") +
scale_fill_manual(
values = c("AA" = "#ffbf00", "AB" = "#66b032", "BB" = "#1982c4"),
name = "Genotype"
) +
theme(legend.position = "none")
# VAF: Facetted by genotype
VAF_facet_gt <- ggplot(f2_homo_parents, aes(x = VAF, fill = GT)) +
geom_histogram(binwidth = 0.02, alpha = 1, position = "identity") +
facet_wrap(~ GT, scales = "free_y") +
scale_x_continuous(breaks = seq(0, 1, by = 0.1)) +
theme_classic() +
labs(title = "B",
x = "Variant Allele Frequency (VAF)",
y = "Count") +
scale_fill_manual(
values = c("AA" = "#ffbf00", "AB" = "#66b032", "BB" = "#1982c4"),
name = "Genotype"
) +
theme(legend.position = "right")
# Combine VAF plots
VAF_combined <- (VAF_all_gt + VAF_facet_gt + plot_layout(widths = c(1, 3)))
# Display the combined VAF plot
VAF_combined
# Add caption below the plot
cat("**Figure 2.** Distribution of Variant Allele Frequency (VAF) values in F2 females from crosses with homozygous F0 and F1 parents.\nA. Overlaid histogram of VAF values across all genotypes.\nB. Faceted distribution of VAF by genotype, showing characteristic frequency patterns for AA, AB, and BB.\nVertical axis represents count of sites; horizontal axis is VAF. Bin width = 0.02.\nGenotypes are color-coded: AA (yellow), AB (green), BB (blue).")
## **Figure 2.** Distribution of Variant Allele Frequency (VAF) values in F2 females from crosses with homozygous F0 and F1 parents.
## A. Overlaid histogram of VAF values across all genotypes.
## B. Faceted distribution of VAF by genotype, showing characteristic frequency patterns for AA, AB, and BB.
## Vertical axis represents count of sites; horizontal axis is VAF. Bin width = 0.02.
## Genotypes are color-coded: AA (yellow), AB (green), BB (blue).
The plots make sense. Its visible that there’s a peak of local
quality heterozygotes below 0.4, which we can safely filter at.
We therefore propose the following boundaries for each genotype:
AA < 0.03
AB > 0.4 and < 0.6
BB > 0.8
# Create the filtered sites dataset that will be used in the next Rmd file
filtered_sites <- all_param %>%
filter(
(GT == "AA" & QAF < 0.03 & VAF < 0.03) | # Homozygous Reference (AA)
(GT == "AB" & QAF > 0.4 & QAF < 0.6 & VAF > 0.4 & VAF < 0.6) | # Heterozygous (AB)
(GT == "BB" & QAF > 0.8 & VAF > 0.8) # Homozygous Alternate (BB)
) %>%
select(family, Sample, site, QAF, VAF, GT) # Keep only necessary columns
# Save the filtered sites dataset for use in the next Rmd file
write.csv(filtered_sites, "/Users/nuriteliash/Documents/GitHub/varroa-pedigree-study/data/filtered_sites.csv", row.names = FALSE)
# Create summary statistics for filtered data
summary_stats <- filtered_sites %>%
group_by(family, GT) %>%
summarise(
count = n(),
mean_VAF = mean(VAF, na.rm = TRUE),
mean_QAF = mean(QAF, na.rm = TRUE),
.groups = 'drop'
)
# Display summary statistics
knitr::kable(summary_stats,
caption = "Summary statistics for filtered sites by family and genotype",
digits = 3)
| family | GT | count | mean_VAF | mean_QAF |
|---|---|---|---|---|
| 43 | AA | 39939 | 0.000 | 0.000 |
| 43 | AB | 7464 | 0.495 | 0.501 |
| 43 | BB | 24072 | 0.984 | 0.997 |
| 46 | AA | 60493 | 0.000 | 0.000 |
| 46 | AB | 218 | 0.502 | 0.502 |
| 46 | BB | 47163 | 0.985 | 0.998 |
| 48 | AA | 26935 | 0.000 | 0.000 |
| 48 | AB | 3682 | 0.495 | 0.501 |
| 48 | BB | 15855 | 0.985 | 0.998 |
| 57 | AA | 40307 | 0.000 | 0.000 |
| 57 | AB | 3053 | 0.496 | 0.502 |
| 57 | BB | 32929 | 0.984 | 0.997 |
| 63 | AA | 36264 | 0.000 | 0.000 |
| 63 | AB | 6553 | 0.497 | 0.502 |
| 63 | BB | 29252 | 0.985 | 0.997 |
| 110 | AA | 36865 | 0.000 | 0.000 |
| 110 | AB | 9676 | 0.497 | 0.502 |
| 110 | BB | 26375 | 0.985 | 0.997 |
| 133 | AA | 36824 | 0.000 | 0.000 |
| 133 | AB | 11843 | 0.497 | 0.503 |
| 133 | BB | 23600 | 0.984 | 0.997 |
| 177 | AA | 45206 | 0.000 | 0.000 |
| 177 | AB | 3777 | 0.495 | 0.501 |
| 177 | BB | 38260 | 0.985 | 0.998 |
| 187 | AA | 27181 | 0.000 | 0.000 |
| 187 | AB | 5846 | 0.498 | 0.502 |
| 187 | BB | 17817 | 0.986 | 0.998 |
| 240 | AA | 61550 | 0.000 | 0.000 |
| 240 | AB | 253 | 0.497 | 0.498 |
| 240 | BB | 50210 | 0.984 | 0.997 |
| 266 | AA | 40135 | 0.000 | 0.000 |
| 266 | AB | 4297 | 0.494 | 0.503 |
| 266 | BB | 25086 | 0.979 | 0.998 |
| 284 | AA | 54229 | 0.000 | 0.000 |
| 284 | AB | 13335 | 0.496 | 0.501 |
| 284 | BB | 44709 | 0.986 | 0.998 |
| 300 | AA | 24775 | 0.000 | 0.000 |
| 300 | AB | 4937 | 0.496 | 0.502 |
| 300 | BB | 17364 | 0.983 | 0.996 |
| 302 | AA | 37988 | 0.000 | 0.000 |
| 302 | AB | 11809 | 0.496 | 0.501 |
| 302 | BB | 28544 | 0.983 | 0.996 |
| 322 | AA | 30318 | 0.000 | 0.000 |
| 322 | AB | 179 | 0.482 | 0.489 |
| 322 | BB | 15646 | 0.984 | 0.998 |
| 338 | AA | 51458 | 0.000 | 0.000 |
| 338 | AB | 15154 | 0.495 | 0.501 |
| 338 | BB | 31715 | 0.978 | 0.997 |
| 400 | AA | 60848 | 0.000 | 0.000 |
| 400 | AB | 191 | 0.493 | 0.493 |
| 400 | BB | 51339 | 0.982 | 0.996 |
| 412 | AA | 50585 | 0.000 | 0.000 |
| 412 | AB | 179 | 0.493 | 0.494 |
| 412 | BB | 35400 | 0.983 | 0.996 |
| 426 | AA | 51842 | 0.000 | 0.000 |
| 426 | AB | 2628 | 0.494 | 0.501 |
| 426 | BB | 34573 | 0.979 | 0.997 |
| 458 | AA | 48417 | 0.000 | 0.000 |
| 458 | AB | 522 | 0.493 | 0.499 |
| 458 | BB | 43567 | 0.983 | 0.996 |
| 476 | AA | 87328 | 0.000 | 0.000 |
| 476 | AB | 9664 | 0.496 | 0.502 |
| 476 | BB | 63542 | 0.984 | 0.997 |
| 478 | AA | 54417 | 0.000 | 0.000 |
| 478 | AB | 14047 | 0.496 | 0.502 |
| 478 | BB | 36540 | 0.983 | 0.997 |
| 498 | AA | 36908 | 0.000 | 0.000 |
| 498 | AB | 11783 | 0.496 | 0.502 |
| 498 | BB | 31353 | 0.983 | 0.997 |
| 502 | AA | 33183 | 0.000 | 0.000 |
| 502 | AB | 2803 | 0.496 | 0.502 |
| 502 | BB | 20025 | 0.984 | 0.998 |
| 520 | AA | 24186 | 0.000 | 0.000 |
| 520 | AB | 5060 | 0.495 | 0.500 |
| 520 | BB | 20782 | 0.984 | 0.998 |
| 534 | AA | 39819 | 0.000 | 0.000 |
| 534 | AB | 10724 | 0.494 | 0.504 |
| 534 | BB | 26413 | 0.974 | 0.997 |
| 548 | AA | 22445 | 0.000 | 0.000 |
| 548 | AB | 6117 | 0.493 | 0.505 |
| 548 | BB | 15725 | 0.968 | 0.997 |
| 564 | AA | 24003 | 0.000 | 0.000 |
| 564 | AB | 8121 | 0.495 | 0.500 |
| 564 | BB | 14657 | 0.983 | 0.997 |
| 596 | AA | 27657 | 0.000 | 0.000 |
| 596 | AB | 4195 | 0.496 | 0.503 |
| 596 | BB | 24738 | 0.984 | 0.997 |
| 600 | AA | 23767 | 0.000 | 0.000 |
| 600 | AB | 10269 | 0.492 | 0.503 |
| 600 | BB | 16611 | 0.973 | 0.998 |
# Save the f2_homo_parents data for use in the next Rmd file and create the QAF and VAF plots for the supp info of the MS
write.csv(f2_homo_parents, "/Users/nuriteliash/Documents/GitHub/varroa-pedigree-study/data/f2_homo_parents.csv", row.names = FALSE)
Now we wish to follow the genotype in specific family, in specific
chromosome.
to do that, we need to look only at informative sites, for which we have
hypothesis about the F2 genotype, based on their parents’ (F1)
genotype.
Based on the determined site-quality threshold, we now filter the sites, and plot informative sites in one family on chromosome 1:
# Step 1: Identify informative sites: where F0 (grandmother) = 0/0 and F1 (mother) = 0/1
F0AA_F1AB <- all_param %>%
filter(generation %in% c("F0", "F1")) %>% # Keep only F0 and F1 individuals
filter(chrom %in% c("1")) %>% # Keep only chromosome 1
group_by(family, site) %>% # Group by family and site
summarise(
grandmother_homo_ref = any(GT == "AA" & generation == "F0" & sex == "female"),
mother_heterozygous = any(GT == "AB" & generation == "F1" & sex == "female"),
.groups = "drop"
) %>%
filter(grandmother_homo_ref & mother_heterozygous) %>% # Ensure criteria are met
select(family, site) # Extract only relevant columns
# Step 2: Count the number of positions per family that passed the filtering
F0AA_F1AB %>%
group_by(family) %>%
summarise(num_sites_passed = n())
## # A tibble: 30 × 2
## family num_sites_passed
## <dbl> <int>
## 1 43 35
## 2 46 16
## 3 48 11
## 4 57 21
## 5 63 915
## 6 110 10
## 7 133 647
## 8 177 71
## 9 187 267
## 10 240 42
## # ℹ 20 more rows
Now we have data on all families, will pick one family each time to plot
# set up family and members
fam_id <- 478
one_fam <- F0AA_F1AB %>% filter(family == fam_id)
# Step 3: Extract all family members' genotypes at these sites
family_sites <- one_fam %>%
inner_join(filtered_sites, by = c("family", "site")) %>% # Keep only sites that passed filtering
separate(site, into = c("chrom", "pos"), sep = "\\.1_", remove = FALSE) %>% # Split site into chrom and pos
mutate(pos = as.numeric(pos)) %>% # Convert pos to numeric
mutate(member = case_when(
str_detect(Sample, "fnd") ~ "grandmother",
str_detect(Sample, "_dat") ~ "mother",
str_detect(Sample, "_son") ~ "father",
str_detect(Sample, "_grndat") ~ "daughter",
str_detect(Sample, "_grnson") ~ "son",
TRUE ~ "unknown"
)) %>%
mutate(member = factor(member, levels = c("grandmother", "father", "mother", "son", "daughter"), ordered = TRUE)) %>%
mutate(member_num = as.numeric(member))
# Find positions that appear in all members
positions_all_members <- family_sites %>%
group_by(pos) %>%
summarise(member_count = n_distinct(member), .groups = "drop") %>%
filter(member_count == n_distinct(family_sites$member)) %>%
pull(pos)
# Filter f2_filtered to keep only rows with these positions
fam2_common <- family_sites %>% filter(pos %in% positions_all_members)
# Filter f2_filtered to keep only sites with depth higher then 20
#fam2_common <- fam2_common %>% filter(DP > 30)
# Ensure positions are sorted
fam2_common <- fam2_common %>% arrange(pos)
# Generate 30 evenly spaced indices and round them to ensure integer indices
indices <- round(seq(1, nrow(fam2_common), length.out = 30))
# Select the corresponding positions
selected_positions <- fam2_common %>% slice(indices) %>% pull(pos)
# Filter the dataset based on these selected positions
fam2_selected <- fam2_common %>% filter(pos %in% selected_positions) %>% filter(!is.na(member), !is.na(pos), !is.na(GT))
# Create a numeric x-axis for spacing (convert 'member' to factor and assign spacing)
fam2_selected %>%
filter(
(GT == "AA" & QAF < 0.03 & VAF < 0.03) | # Homozygous Reference (AA)
(GT == "AB" & QAF > 0.4 & QAF < 0.6 & VAF > 0.4 & VAF < 0.6) | # Heterozygous (AB)
(GT == "BB" & QAF > 0.8 & VAF > 0.8) # Homozygous Alternate (BB)
) %>%
ggplot(aes(x = member_num, y = pos / 1e6, color = GT)) + # Convert pos to Mb
geom_segment(aes(x = member_num - 0.3, xend = member_num + 0.3,
y = pos / 1e6, yend = pos / 1e6), size = 1.5) + # Apply same conversion
scale_x_continuous(breaks = unique(fam2_selected$member_num), labels = unique(fam2_selected$member)) + # Restore original labels
scale_color_manual(
values = c("AA" = "#ffbf00", "AB" = "#66b032", "BB" = "#1982c4"), # Updated colors for renamed genotypes
name = "Genotype",
breaks = c("AA", "AB", "BB") # Ensures BB appears in the legend even if absent
) +
theme_classic() +
labs(
title = paste("Genotype Flow Across Family Members
Family ID:", fam_id),
x = "Family Member",
y = "Genomic
Position (Mb)"
) +
theme(
axis.text.x = element_text(size = 12, face = "bold", angle = 30, hjust = 1), # Rotate labels
panel.grid.major = element_blank(),
axis.line.x = element_blank(), # <-- Removes the x-axis line
axis.ticks.x = element_blank()) # Removes x-axis ticks
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
save the filtered sites,to be used in the pulled data analysis (~/Documents/GitHub/varroa-pedigree-study/R_scripts/After_revision/figs-for-varroa-pedigree-MS.Rmd)
# Filter sites based on QAF & VAF thresholds while keeping sites per family
filtered_sites <- all_param %>%
group_by(family, site, Sample) %>% # Keep sites grouped by family
filter(
(GT == "AA" & QAF < 0.03 & VAF < 0.03) | # Homozygous Reference (AA)
(GT == "AB" & QAF > 0.4 & QAF < 0.6 & VAF > 0.4 & VAF < 0.6) | # Heterozygous (AB)
(GT == "BB" & QAF > 0.8 & VAF > 0.8) # Homozygous Alternate (BB)
) %>%
ungroup() %>% # Ensure the result is ungrouped for later use
select(family, Sample, site, QAF, VAF, GT) # Keep family-site-sample pairs without removing duplicates
filtered_sites_summary <- filtered_sites %>%
group_by(family, Sample) %>% # Keep sites grouped by family
summarise(num_sites_kept = n(), .groups = "drop") %>% # Keep counts by Sample
mutate(family = as.character(family),
num_sites_kept = as.numeric(num_sites_kept))
# Save to CSV
write.csv(filtered_sites, "/Users/nuriteliash/Documents/GitHub/varroa-pedigree-study/data/filtered_sites.csv", row.names = FALSE)
# save the fam2_selected to make the figure 1A in the figures RMD: ~/Documents/GitHub/varroa-pedigree-study/R_scripts/After_revision/figs-for-varroa-pedigree-MS.Rmd
write.csv(fam2_selected, "/Users/nuriteliash/Documents/GitHub/varroa-pedigree-study/data/fam2_selected.csv", row.names = FALSE)