Main figures
Figure 1A: Crossing F1 mother AB with F1 father AA, one family
~/Documents/GitHub/varroa-pedigree-study/R_scripts/After_revision/site_quality_exploration.Rmd
# Load the fam2_selected data, for one family (478) on the largest chromosome, NW_019211454
fam_478_F0AA_F1AB <- read.csv("/Users/nuriteliash/Documents/GitHub/varroa-pedigree-study/data/fam_478_F0AA_F1AB.csv")
# Create a numeric x-axis for spacing (convert 'member' to factor and assign spacing)
fam_id <- 478
fam_478_F0AA_F1AB %>%
ggplot(aes(x = member_num, y = pos / 1e6, color = GT)) + # Convert pos to Mb
geom_segment(aes(x = member_num - 0.3, xend = member_num + 0.3,
y = pos / 1e6, yend = pos / 1e6), size = 1.5) + # Apply same conversion
scale_x_continuous(breaks = unique(fam_478_F0AA_F1AB$member_num), labels = unique(fam_478_F0AA_F1AB$member)) + # Restore original labels
scale_color_manual(
values = c("AA" = "#ffbf00", "AB" = "#66b032", "BB" = "#1982c4"), # Updated colors for renamed genotypes
name = "Genotype",
breaks = c("AA", "AB", "BB") # Ensures BB appears in the legend even if absent
) +
theme_classic() +
labs(
title = paste("Genotype Flow Across Family Members\nFamily ID:", fam_id),
x = "Family Member",
y = "Genomic Position (Mb)"
) +
theme(
axis.text.x = element_text(size = 10, angle = 45, hjust = 1, vjust = 1), # Adjusted text size and angle
axis.title.x = element_text(margin = margin(t = 15)), # Add margin above x-axis title
panel.grid.major = element_blank(),
legend.position = "right", # Move legend to the right
legend.box.margin = margin(0, 0, 0, 10), # Add margin to separate legend from plot
plot.margin = margin(10, 20, 30, 10) # Increased bottom margin to accommodate rotated labels
)
# Add caption below the plot
cat("\n**Figure 1A.** Genotype flow visualization across a single family (Family ID: 478). Each horizontal line represents a genomic position on chromosome NW_019211454.1, with colors indicating genotypes (yellow = AA, green = AB, blue = BB). Family members are arranged in generational order, showing the inheritance pattern from parents to offspring. The y-axis shows the genomic position in megabases (Mb).")
##
## **Figure 1A.** Genotype flow visualization across a single family (Family ID: 478). Each horizontal line represents a genomic position on chromosome NW_019211454.1, with colors indicating genotypes (yellow = AA, green = AB, blue = BB). Family members are arranged in generational order, showing the inheritance pattern from parents to offspring. The y-axis shows the genomic position in megabases (Mb).
Figure 1C: Crossing F1 mother AB with F1 father AA, all families
# Set the families (include only families with at least one adult F2)
family <- grep("grndat|grnson", gt$ID, value=TRUE) %>%
str_extract("[^_]+") %>%
unique()
# Define a list to store all data frames per family
obs <- list()
for (fam in family) {
obs[[fam]] <- table %>%
dplyr::select(starts_with(fam)) %>%
# Add site and chromosome information
rownames_to_column("site") %>%
mutate(chrom = str_split(site, pattern = "_", simplify = TRUE)[, 2]) %>%
# Then filter for the genotype patterns we want
dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "AA")) %>% # F0 Female must be AA
dplyr::filter_at(vars(matches("_son")), all_vars(. == "AA")) %>% # Sons must be AA
dplyr::filter_at(vars(matches("_dat")), all_vars(. == "AB")) %>% # Daughters must be AB
# Select only the grandchildren columns and keep site/chrom info
dplyr::select(site, chrom, contains("grn")) %>%
# Reshape the data
tidyr::pivot_longer(cols = -c(site, chrom)) %>%
dplyr::rename(Sample = name, gt = value) %>%
# filter for sites
left_join(filtered_sites, by = c("site", "chrom", "Sample", "gt")) %>%
na.omit() %>% # Count genotypes
dplyr::count(Sample, gt, chrom, .drop = FALSE) %>%
dplyr::filter(gt %in% c("AA", "BB", "AB")) %>%
mutate(n = as.numeric(n)) %>%
group_by(Sample) %>%
mutate(total = as.numeric(sum(n))) %>%
mutate(sex = case_when(
grepl("son", Sample) ~ "Male",
grepl("dat", Sample) ~ "Female"
)) %>%
# Add family column
mutate(family = fam)
}
# Bind all families together into a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>%
mutate(Sample = as.character(Sample))
# Create samples dataframe
samples <- data.frame(
Sample = rep(unique(observed$Sample), each = 3),
gt = rep(c("AA", "AB", "BB"))
) %>%
mutate(family = str_extract(Sample, "^[0-9]+"))
# Join and create final dataset
samples_obs_filtered_ABxAA <- left_join(samples, observed, by = c("family", "Sample", "gt")) %>%
mutate(sex = case_when(
grepl("son", Sample) ~ "Male",
grepl("dat", Sample) ~ "Female"
)) %>%
group_by(Sample, chrom) %>%
mutate(total = as.numeric(sum(n))) %>%
dplyr::mutate(prop = n/total) %>%
na.omit()
### Summary of sites filtered in each sample
# Create a table showing sites filtered per sample
sites_per_sample <- table %>%
rownames_to_column("site") %>%
tidyr::pivot_longer(cols = -site, names_to = "Sample", values_to = "gt") %>%
# Filter out NA values before counting
filter(!is.na(gt)) %>%
group_by(Sample) %>%
summarise(
total_sites = n(),
.groups = 'drop'
)
filtered_sites_per_sample <- filtered_sites %>%
group_by(Sample) %>%
summarise(
filtered_sites = n(),
.groups = 'drop'
)
# Join and calculate differences
sites_comparison <- sites_per_sample %>%
left_join(filtered_sites_per_sample, by = "Sample") %>%
mutate(
filtered_sites = replace_na(filtered_sites, 0),
sites_removed = total_sites - filtered_sites,
percent_removed = round((sites_removed / total_sites) * 100, 2)
) %>%
arrange(desc(sites_removed))
# Display the table
sites_comparison %>%
select(Sample, total_sites, filtered_sites, sites_removed, percent_removed) %>%
rename(
"Sample" = Sample,
"Total Sites" = total_sites,
"Sites Kept" = filtered_sites,
"Sites Removed" = sites_removed,
"% Removed" = percent_removed
) %>%
knitr::kable(
digits = 2,
caption = "Number of sites filtered per sample",
format = "html"
) %>%
kableExtra::kable_styling(
bootstrap_options = "striped",
full_width = TRUE,
position = "left",
fixed_thead = TRUE
) %>%
kableExtra::scroll_box(
width = "100%",
height = "300px"
)
| Sample | Total Sites | Sites Kept | Sites Removed | % Removed |
|---|---|---|---|---|
| 240_241a_grndat | 27635 | 25552 | 2083 | 7.54 |
| 302_302a_son | 23714 | 21809 | 1905 | 8.03 |
| 400_400a_son | 29567 | 27804 | 1763 | 5.96 |
| 338_338_fnd | 30664 | 29027 | 1637 | 5.34 |
| 426_427a_grndat | 30732 | 29197 | 1535 | 4.99 |
| 412_412a_son | 29902 | 28446 | 1456 | 4.87 |
| 548_549_dat | 18715 | 17279 | 1436 | 7.67 |
| 458_458_fnd | 29867 | 28468 | 1399 | 4.68 |
| 412_412_fnd | 27910 | 26559 | 1351 | 4.84 |
| 400_401a_grnson | 29747 | 28407 | 1340 | 4.50 |
| 302_303b_grnson | 28367 | 27044 | 1323 | 4.66 |
| 412_413_dat | 24250 | 22953 | 1297 | 5.35 |
| 400_401_dat | 29889 | 28595 | 1294 | 4.33 |
| 458_459_dat | 30157 | 28864 | 1293 | 4.29 |
| 476_477b_grnson | 28391 | 27102 | 1289 | 4.54 |
| 534_535_2a_grndat | 20940 | 19683 | 1257 | 6.00 |
| 478_479-1a_grnson | 28898 | 27738 | 1160 | 4.01 |
| 300_300_fnd | 22043 | 20903 | 1140 | 5.17 |
| 596_596a_son | 28716 | 27597 | 1119 | 3.90 |
| 548_548_fnd | 21544 | 20461 | 1083 | 5.03 |
| 458_459a_grnson | 28336 | 27310 | 1026 | 3.62 |
| 266_267a_grndat | 16208 | 15192 | 1016 | 6.27 |
| 302_302_fnd | 18528 | 17523 | 1005 | 5.42 |
| 300_300a_son | 27961 | 26967 | 994 | 3.55 |
| 534_535_2c_grnson | 17322 | 16338 | 984 | 5.68 |
| 110_111c_grnson | 30064 | 29096 | 968 | 3.22 |
| 426_427_dat | 29893 | 28930 | 963 | 3.22 |
| 284_285b_grndat | 30316 | 29360 | 956 | 3.15 |
| 478_478a_son | 28243 | 27290 | 953 | 3.37 |
| 476_477d_grndat | 28763 | 27820 | 943 | 3.28 |
| 110_111_dat | 20695 | 19766 | 929 | 4.49 |
| 266_267_dat | 26325 | 25404 | 921 | 3.50 |
| 534_534a_son | 18886 | 17968 | 918 | 4.86 |
| 498_499c_grndat | 27543 | 26628 | 915 | 3.32 |
| 57_58_dat | 30612 | 29703 | 909 | 2.97 |
| 564_565-1_dat | 12733 | 11826 | 907 | 7.12 |
| 534_535_2_dat | 13200 | 12303 | 897 | 6.80 |
| 57_58a_grndat | 28188 | 27303 | 885 | 3.14 |
| 400_400_fnd | 27343 | 26461 | 882 | 3.23 |
| 338_339a_grndat | 30321 | 29459 | 862 | 2.84 |
| 596_597_dat | 26693 | 25846 | 847 | 3.17 |
| 63_63a_son | 29482 | 28639 | 843 | 2.86 |
| 476_476a_son | 24736 | 23894 | 842 | 3.40 |
| 476_477a_grndat | 25948 | 25115 | 833 | 3.21 |
| 426_427b_grnson | 16382 | 15572 | 810 | 4.94 |
| 520_520a_son | 14326 | 13520 | 806 | 5.63 |
| 548_548a_son | 11629 | 10825 | 804 | 6.91 |
| 63_64_dat | 30582 | 29786 | 796 | 2.60 |
| 240_240_fnd | 30066 | 29276 | 790 | 2.63 |
| 498_499a_grnson | 27913 | 27126 | 787 | 2.82 |
| 43_44_dat | 28703 | 27928 | 775 | 2.70 |
| 110_111d_grndat | 26068 | 25302 | 766 | 2.94 |
| 133_134_dat | 24195 | 23445 | 750 | 3.10 |
| 478_478_fnd | 25274 | 24527 | 747 | 2.96 |
| 284_284a_son | 28711 | 27966 | 745 | 2.59 |
| 46_46a_son | 22390 | 21653 | 737 | 3.29 |
| 338_339_dat | 30047 | 29311 | 736 | 2.45 |
| 338_339b_grnson | 10129 | 9397 | 732 | 7.23 |
| 520_520_fnd | 29042 | 28319 | 723 | 2.49 |
| 177_178_dat | 29476 | 28755 | 721 | 2.45 |
| 600_600a_son | 14625 | 13907 | 718 | 4.91 |
| 478_479-1b_grndat | 14808 | 14095 | 713 | 4.81 |
| 564_565-1b_grnson | 25552 | 24841 | 711 | 2.78 |
| 240_240a_son | 23131 | 22422 | 709 | 3.07 |
| 110_110a_son | 9180 | 8472 | 708 | 7.71 |
| 534_534_fnd | 19995 | 19290 | 705 | 3.53 |
| 187_187_fnd | 20910 | 20206 | 704 | 3.37 |
| 478_479-1_dat | 27355 | 26658 | 697 | 2.55 |
| 600_601_dat | 21128 | 20443 | 685 | 3.24 |
| 43_44a_grndat | 28741 | 28059 | 682 | 2.37 |
| 476_477c_grndat | 27817 | 27140 | 677 | 2.43 |
| 133_133_fnd | 25282 | 24609 | 673 | 2.66 |
| 502_503f_grndat | 24641 | 23968 | 673 | 2.73 |
| 564_564a_son | 11455 | 10782 | 673 | 5.88 |
| 284_285_dat | 29352 | 28681 | 671 | 2.29 |
| 498_498a_son | 6923 | 6254 | 669 | 9.66 |
| 266_267b_grndat | 29943 | 29277 | 666 | 2.22 |
| 600_600_fnd | 13659 | 12997 | 662 | 4.85 |
| 338_338a_son | 15304 | 14649 | 655 | 4.28 |
| 46_47_dat | 29051 | 28412 | 639 | 2.20 |
| 476_476_fnd | 27912 | 27275 | 637 | 2.28 |
| 46_47d_grnson | 22004 | 21372 | 632 | 2.87 |
| 133_133a_son | 28760 | 28139 | 621 | 2.16 |
| 498_499b_grndat | 10473 | 9853 | 620 | 5.92 |
| 476_477_dat | 15022 | 14411 | 611 | 4.07 |
| 284_285a_grnson | 18988 | 18381 | 607 | 3.20 |
| 322_323a_grndat | 15980 | 15374 | 606 | 3.79 |
| 564_564_fnd | 10253 | 9653 | 600 | 5.85 |
| 48_48a_son | 17227 | 16636 | 591 | 3.43 |
| 187_188_dat | 30316 | 29733 | 583 | 1.92 |
| 458_458a_son | 7966 | 7387 | 579 | 7.27 |
| 520_521_dat | 15022 | 14448 | 574 | 3.82 |
| 177_177a_son | 30198 | 29626 | 572 | 1.89 |
| 48_49_dat | 28312 | 27742 | 570 | 2.01 |
| 302_303_dat | 25317 | 24761 | 556 | 2.20 |
| 426_426_fnd | 12548 | 12000 | 548 | 4.37 |
| 240_241_dat | 26002 | 25459 | 543 | 2.09 |
| 133_134a_grnson | 8917 | 8380 | 537 | 6.02 |
| 322_322a_son | 17800 | 17268 | 532 | 2.99 |
| 63_63_fnd | 20105 | 19586 | 519 | 2.58 |
| 600_601a_grnson | 12873 | 12395 | 478 | 3.71 |
| 43_44b_grnson | 11657 | 11180 | 477 | 4.09 |
| 502_503_dat | 11568 | 11100 | 468 | 4.05 |
| 57_58b_grnson | 9562 | 9096 | 466 | 4.87 |
| 177_177_fnd | 29519 | 29056 | 463 | 1.57 |
| 46_47c_grndat | 27492 | 27034 | 458 | 1.67 |
| 498_499_dat | 20238 | 19780 | 458 | 2.26 |
| 502_503b_grnson | 8277 | 7830 | 447 | 5.40 |
| 43_43a_son | 9442 | 8998 | 444 | 4.70 |
| 412_413a_grnson | 8076 | 7649 | 427 | 5.29 |
| 57_57_fnd | 8619 | 8235 | 384 | 4.46 |
| 240_241b_grndat | 5219 | 4852 | 367 | 7.03 |
| 502_503a_grndat | 6622 | 6266 | 356 | 5.38 |
| 498_498_fnd | 3780 | 3426 | 354 | 9.37 |
| 57_57a_son | 5418 | 5071 | 347 | 6.40 |
| 596_596_fnd | 5432 | 5092 | 340 | 6.26 |
| 48_48_fnd | 6036 | 5717 | 319 | 5.28 |
| 300_301a_grnson | 3879 | 3565 | 314 | 8.09 |
| 46_46_fnd | 9008 | 8709 | 299 | 3.32 |
| 426_426a_son | 5075 | 4787 | 288 | 5.67 |
| 284_284_fnd | 21473 | 21195 | 278 | 1.29 |
| 502_502a_son | 6414 | 6144 | 270 | 4.21 |
| 110_110_fnd | 2163 | 1904 | 259 | 11.97 |
| 187_187a_son | 5883 | 5634 | 249 | 4.23 |
| 502_502_fnd | 4941 | 4698 | 243 | 4.92 |
| 322_323_dat | 5555 | 5381 | 174 | 3.13 |
| 187_188a_grnson | 1898 | 1726 | 172 | 9.06 |
| 596_597a_grnson | 3572 | 3422 | 150 | 4.20 |
| 43_43_fnd | 2310 | 2161 | 149 | 6.45 |
| 240_241c_grnson | 3751 | 3605 | 146 | 3.89 |
| 300_301_dat | 2112 | 1989 | 123 | 5.82 |
| 322_323b_grnson | 6074 | 5953 | 121 | 1.99 |
| 266_266_fnd | 1816 | 1731 | 85 | 4.68 |
| 177_178b_grnson | 1422 | 1349 | 73 | 5.13 |
| 322_322_fnd | 2236 | 2164 | 72 | 3.22 |
| 266_266a_son | 1440 | 1372 | 68 | 4.72 |
| 177_178a_grndat | 1364 | 1302 | 62 | 4.55 |
# Set minimum sites threshold
site_threshold = as.numeric(10)
# Create summary statistics for filtered data
summary_df_filtered <- samples_obs_filtered_ABxAA %>%
filter(total >= site_threshold) %>%
group_by(sex) %>%
summarise(
num_samples = n_distinct(Sample), # Changed to Sample with capital S
total_sites = sum(total),
.groups = "drop"
)
# Create summary statistics for filtered data by chromosome
summary_df_filtered_by_chrom <- samples_obs_filtered_ABxAA %>%
filter(total >= site_threshold) %>%
group_by(sex, chrom) %>%
summarise(
num_samples = n_distinct(Sample), # Changed to Sample with capital S
total_sites = sum(total),
.groups = "drop"
)
# Pulled data with totals across all chromosomes
ggplot(samples_obs_filtered_ABxAA %>% filter(total >= site_threshold),
aes(fill = gt, y = prop, x = factor(sex, levels = c("Female", "Male")))) +
geom_bar(position = "fill", stat = "identity", width = 0.8) +
ylab("Genotype Proportion") +
labs(title = "Offspring Genotype, Crossing (AB) Male x (AA) Female
Filtered sites",
fill = "Genotype") +
theme_classic() +
theme(
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.y = element_text(size = 14),
strip.text.x = element_text(size = 12),
panel.spacing = unit(1, "lines"),
plot.margin = margin(t = 10, r = 10, b = 0, l = 10)
) +
scale_fill_manual(values = c("#ffbf00", "#66b032", "#1982c4")) +
# Add sample count labels
geom_text(data = summary_df_filtered,
aes(x = sex,
y = -0.03,
label = paste0(num_samples, " ", sex, "s")),
inherit.aes = FALSE,
size = 2.8,
vjust = 1) +
# Add sites count labels
geom_text(data = summary_df_filtered,
aes(x = sex,
y = -0.18,
label = paste0(format(total_sites, big.mark = ","), " sites")),
inherit.aes = FALSE,
size = 2.8,
vjust = 0.5)
# clean plot without labels , for the MS
ggplot(samples_obs_filtered_ABxAA %>% filter(total >= 10),
aes(fill = gt, y = prop, x = factor(sex, levels = c("Female", "Male")))) +
geom_bar(position = "fill", stat = "identity", width = 0.7) +
ylab("Genotype
Proportion") +
labs(title = "Offspring Genotype, Crossing (AA) Male x (AB) Female
Filtered sites",
fill = "Genotype") +
theme_classic() +
theme(
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.y = element_text(size = 14),
axis.title.y = element_text(size = 16),
strip.text.x = element_text(size = 12),
panel.spacing = unit(1, "lines"),
plot.margin = margin(t = 10, r = 10, b = 0, l = 10)
) +
scale_fill_manual(values = c("#ffbf00", "#66b032", "#1982c4"))# +
#facet_wrap(~family)
Figure 1B: Crossing F1 mother AA with F1 father AB, one family
# Load the fam2_selected data, for one family (534) on the largest chromosome, NW_019211454
fam_534_F0AA_F1AA <- read.csv("/Users/nuriteliash/Documents/GitHub/varroa-pedigree-study/data/fam_534_F0AA_F1AA.csv")
# Create a numeric x-axis for spacing (convert 'member' to factor and assign spacing)
fam_id <- 534
fam_534_F0AA_F1AA %>%
ggplot(aes(x = member_num, y = pos / 1e6, color = GT)) + # Convert pos to Mb
geom_segment(aes(x = member_num - 0.3, xend = member_num + 0.3,
y = pos / 1e6, yend = pos / 1e6), size = 1.5) + # Apply same conversion
scale_x_continuous(breaks = unique(fam_534_F0AA_F1AA$member_num), labels = unique(fam_534_F0AA_F1AA$member)) + # Restore original labels
scale_color_manual(
values = c("AA" = "#ffbf00", "AB" = "#66b032", "BB" = "#1982c4"), # Updated colors for renamed genotypes
name = "Genotype",
breaks = c("AA", "AB", "BB") # Ensures BB appears in the legend even if absent
) +
theme_classic() +
labs(
title = paste("Genotype Flow Across Family Members\nFamily ID:", fam_id),
x = "Family Member",
y = "Genomic Position (Mb)"
) +
theme(
axis.text.x = element_text(size = 10, angle = 45, hjust = 1, vjust = 1), # Adjusted text size and angle
axis.title.x = element_text(margin = margin(t = 15)), # Add margin above x-axis title
panel.grid.major = element_blank(),
legend.position = "right", # Move legend to the right
legend.box.margin = margin(0, 0, 0, 10), # Add margin to separate legend from plot
plot.margin = margin(10, 20, 30, 10) # Increased bottom margin to accommodate rotated labels
)
# Add caption below the plot
cat("\n**Figure 1A.** Genotype flow visualization across a single family (Family ID: 534). Each horizontal line represents a genomic position on chromosome NW_019211454.1, with colors indicating genotypes (yellow = AA, green = AB, blue = BB). Family members are arranged in generational order, showing the inheritance pattern from parents to offspring. The y-axis shows the genomic position in megabases (Mb).")
##
## **Figure 1A.** Genotype flow visualization across a single family (Family ID: 534). Each horizontal line represents a genomic position on chromosome NW_019211454.1, with colors indicating genotypes (yellow = AA, green = AB, blue = BB). Family members are arranged in generational order, showing the inheritance pattern from parents to offspring. The y-axis shows the genomic position in megabases (Mb).
Figure 1D: Crossing F1 mother AA with F1 father AB, all families
# Set the families (include only families with at least one adult F2)
family <- grep("grndat|grnson", gt$ID, value=TRUE) %>%
str_extract("[^_]+") %>%
unique()
# Define a list to store all data frames per family
obs <- list()
for (fam in family) {
obs[[fam]] <- table %>%
dplyr::select(starts_with(fam)) %>%
# Add site and chromosome information
rownames_to_column("site") %>%
mutate(chrom = str_split(site, pattern = "_", simplify = TRUE)[, 2]) %>%
# Then filter for the genotype patterns we want
dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "AB")) %>% # F0 Female must be AB
dplyr::filter_at(vars(matches("_son")), all_vars(. == "AB")) %>% # Sons must be AB
dplyr::filter_at(vars(matches("_dat")), all_vars(. == "AA")) %>% # Daughters must be AA
# Select only the grandchildren columns and keep site/chrom info
dplyr::select(site, chrom, contains("grn")) %>%
# Reshape the data
tidyr::pivot_longer(cols = -c(site, chrom)) %>%
dplyr::rename(Sample = name, gt = value) %>%
# filter for sites
left_join(filtered_sites, by = c("site", "chrom", "Sample", "gt")) %>%
na.omit() %>% # Count genotypes
dplyr::count(Sample, gt, chrom, .drop = FALSE) %>%
dplyr::filter(gt %in% c("AA", "BB", "AB")) %>%
mutate(n = as.numeric(n)) %>%
group_by(Sample) %>%
mutate(total = as.numeric(sum(n))) %>%
mutate(sex = case_when(
grepl("son", Sample) ~ "Male",
grepl("dat", Sample) ~ "Female"
)) %>%
# Add family column
mutate(family = fam)
}
# Bind all families together into a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>%
mutate(Sample = as.character(Sample))
# Create samples dataframe
samples <- data.frame(
Sample = rep(unique(observed$Sample), each = 3),
gt = rep(c("AA", "AB", "BB"))
) %>%
mutate(family = str_extract(Sample, "^[0-9]+"))
# Join and create final dataset
samples_obs_filtered_AAxAB <- left_join(samples, observed, by = c("family", "Sample", "gt")) %>%
mutate(sex = case_when(
grepl("son", Sample) ~ "Male",
grepl("dat", Sample) ~ "Female"
)) %>%
group_by(Sample, chrom) %>%
mutate(total = as.numeric(sum(n))) %>%
dplyr::mutate(prop = n/total) %>%
na.omit()
### Summary of sites filtered in each sample
# Create a table showing sites filtered per sample
sites_per_sample <- table %>%
rownames_to_column("site") %>%
tidyr::pivot_longer(cols = -site, names_to = "Sample", values_to = "gt") %>%
# Filter out NA values before counting
filter(!is.na(gt)) %>%
group_by(Sample) %>%
summarise(
total_sites = n(),
.groups = 'drop'
)
filtered_sites_per_sample <- filtered_sites %>%
group_by(Sample) %>%
summarise(
filtered_sites = n(),
.groups = 'drop'
)
# Join and calculate differences
sites_comparison <- sites_per_sample %>%
left_join(filtered_sites_per_sample, by = "Sample") %>%
mutate(
filtered_sites = replace_na(filtered_sites, 0),
sites_removed = total_sites - filtered_sites,
percent_removed = round((sites_removed / total_sites) * 100, 2)
) %>%
arrange(desc(sites_removed))
# Display the table
sites_comparison %>%
select(Sample, total_sites, filtered_sites, sites_removed, percent_removed) %>%
rename(
"Sample" = Sample,
"Total Sites" = total_sites,
"Sites Kept" = filtered_sites,
"Sites Removed" = sites_removed,
"% Removed" = percent_removed
) %>%
knitr::kable(
digits = 2,
caption = "Number of sites filtered per sample",
format = "html"
) %>%
kableExtra::kable_styling(
bootstrap_options = "striped",
full_width = TRUE,
position = "left",
fixed_thead = TRUE
) %>%
kableExtra::scroll_box(
width = "100%",
height = "300px"
)
| Sample | Total Sites | Sites Kept | Sites Removed | % Removed |
|---|---|---|---|---|
| 240_241a_grndat | 27635 | 25552 | 2083 | 7.54 |
| 302_302a_son | 23714 | 21809 | 1905 | 8.03 |
| 400_400a_son | 29567 | 27804 | 1763 | 5.96 |
| 338_338_fnd | 30664 | 29027 | 1637 | 5.34 |
| 426_427a_grndat | 30732 | 29197 | 1535 | 4.99 |
| 412_412a_son | 29902 | 28446 | 1456 | 4.87 |
| 548_549_dat | 18715 | 17279 | 1436 | 7.67 |
| 458_458_fnd | 29867 | 28468 | 1399 | 4.68 |
| 412_412_fnd | 27910 | 26559 | 1351 | 4.84 |
| 400_401a_grnson | 29747 | 28407 | 1340 | 4.50 |
| 302_303b_grnson | 28367 | 27044 | 1323 | 4.66 |
| 412_413_dat | 24250 | 22953 | 1297 | 5.35 |
| 400_401_dat | 29889 | 28595 | 1294 | 4.33 |
| 458_459_dat | 30157 | 28864 | 1293 | 4.29 |
| 476_477b_grnson | 28391 | 27102 | 1289 | 4.54 |
| 534_535_2a_grndat | 20940 | 19683 | 1257 | 6.00 |
| 478_479-1a_grnson | 28898 | 27738 | 1160 | 4.01 |
| 300_300_fnd | 22043 | 20903 | 1140 | 5.17 |
| 596_596a_son | 28716 | 27597 | 1119 | 3.90 |
| 548_548_fnd | 21544 | 20461 | 1083 | 5.03 |
| 458_459a_grnson | 28336 | 27310 | 1026 | 3.62 |
| 266_267a_grndat | 16208 | 15192 | 1016 | 6.27 |
| 302_302_fnd | 18528 | 17523 | 1005 | 5.42 |
| 300_300a_son | 27961 | 26967 | 994 | 3.55 |
| 534_535_2c_grnson | 17322 | 16338 | 984 | 5.68 |
| 110_111c_grnson | 30064 | 29096 | 968 | 3.22 |
| 426_427_dat | 29893 | 28930 | 963 | 3.22 |
| 284_285b_grndat | 30316 | 29360 | 956 | 3.15 |
| 478_478a_son | 28243 | 27290 | 953 | 3.37 |
| 476_477d_grndat | 28763 | 27820 | 943 | 3.28 |
| 110_111_dat | 20695 | 19766 | 929 | 4.49 |
| 266_267_dat | 26325 | 25404 | 921 | 3.50 |
| 534_534a_son | 18886 | 17968 | 918 | 4.86 |
| 498_499c_grndat | 27543 | 26628 | 915 | 3.32 |
| 57_58_dat | 30612 | 29703 | 909 | 2.97 |
| 564_565-1_dat | 12733 | 11826 | 907 | 7.12 |
| 534_535_2_dat | 13200 | 12303 | 897 | 6.80 |
| 57_58a_grndat | 28188 | 27303 | 885 | 3.14 |
| 400_400_fnd | 27343 | 26461 | 882 | 3.23 |
| 338_339a_grndat | 30321 | 29459 | 862 | 2.84 |
| 596_597_dat | 26693 | 25846 | 847 | 3.17 |
| 63_63a_son | 29482 | 28639 | 843 | 2.86 |
| 476_476a_son | 24736 | 23894 | 842 | 3.40 |
| 476_477a_grndat | 25948 | 25115 | 833 | 3.21 |
| 426_427b_grnson | 16382 | 15572 | 810 | 4.94 |
| 520_520a_son | 14326 | 13520 | 806 | 5.63 |
| 548_548a_son | 11629 | 10825 | 804 | 6.91 |
| 63_64_dat | 30582 | 29786 | 796 | 2.60 |
| 240_240_fnd | 30066 | 29276 | 790 | 2.63 |
| 498_499a_grnson | 27913 | 27126 | 787 | 2.82 |
| 43_44_dat | 28703 | 27928 | 775 | 2.70 |
| 110_111d_grndat | 26068 | 25302 | 766 | 2.94 |
| 133_134_dat | 24195 | 23445 | 750 | 3.10 |
| 478_478_fnd | 25274 | 24527 | 747 | 2.96 |
| 284_284a_son | 28711 | 27966 | 745 | 2.59 |
| 46_46a_son | 22390 | 21653 | 737 | 3.29 |
| 338_339_dat | 30047 | 29311 | 736 | 2.45 |
| 338_339b_grnson | 10129 | 9397 | 732 | 7.23 |
| 520_520_fnd | 29042 | 28319 | 723 | 2.49 |
| 177_178_dat | 29476 | 28755 | 721 | 2.45 |
| 600_600a_son | 14625 | 13907 | 718 | 4.91 |
| 478_479-1b_grndat | 14808 | 14095 | 713 | 4.81 |
| 564_565-1b_grnson | 25552 | 24841 | 711 | 2.78 |
| 240_240a_son | 23131 | 22422 | 709 | 3.07 |
| 110_110a_son | 9180 | 8472 | 708 | 7.71 |
| 534_534_fnd | 19995 | 19290 | 705 | 3.53 |
| 187_187_fnd | 20910 | 20206 | 704 | 3.37 |
| 478_479-1_dat | 27355 | 26658 | 697 | 2.55 |
| 600_601_dat | 21128 | 20443 | 685 | 3.24 |
| 43_44a_grndat | 28741 | 28059 | 682 | 2.37 |
| 476_477c_grndat | 27817 | 27140 | 677 | 2.43 |
| 133_133_fnd | 25282 | 24609 | 673 | 2.66 |
| 502_503f_grndat | 24641 | 23968 | 673 | 2.73 |
| 564_564a_son | 11455 | 10782 | 673 | 5.88 |
| 284_285_dat | 29352 | 28681 | 671 | 2.29 |
| 498_498a_son | 6923 | 6254 | 669 | 9.66 |
| 266_267b_grndat | 29943 | 29277 | 666 | 2.22 |
| 600_600_fnd | 13659 | 12997 | 662 | 4.85 |
| 338_338a_son | 15304 | 14649 | 655 | 4.28 |
| 46_47_dat | 29051 | 28412 | 639 | 2.20 |
| 476_476_fnd | 27912 | 27275 | 637 | 2.28 |
| 46_47d_grnson | 22004 | 21372 | 632 | 2.87 |
| 133_133a_son | 28760 | 28139 | 621 | 2.16 |
| 498_499b_grndat | 10473 | 9853 | 620 | 5.92 |
| 476_477_dat | 15022 | 14411 | 611 | 4.07 |
| 284_285a_grnson | 18988 | 18381 | 607 | 3.20 |
| 322_323a_grndat | 15980 | 15374 | 606 | 3.79 |
| 564_564_fnd | 10253 | 9653 | 600 | 5.85 |
| 48_48a_son | 17227 | 16636 | 591 | 3.43 |
| 187_188_dat | 30316 | 29733 | 583 | 1.92 |
| 458_458a_son | 7966 | 7387 | 579 | 7.27 |
| 520_521_dat | 15022 | 14448 | 574 | 3.82 |
| 177_177a_son | 30198 | 29626 | 572 | 1.89 |
| 48_49_dat | 28312 | 27742 | 570 | 2.01 |
| 302_303_dat | 25317 | 24761 | 556 | 2.20 |
| 426_426_fnd | 12548 | 12000 | 548 | 4.37 |
| 240_241_dat | 26002 | 25459 | 543 | 2.09 |
| 133_134a_grnson | 8917 | 8380 | 537 | 6.02 |
| 322_322a_son | 17800 | 17268 | 532 | 2.99 |
| 63_63_fnd | 20105 | 19586 | 519 | 2.58 |
| 600_601a_grnson | 12873 | 12395 | 478 | 3.71 |
| 43_44b_grnson | 11657 | 11180 | 477 | 4.09 |
| 502_503_dat | 11568 | 11100 | 468 | 4.05 |
| 57_58b_grnson | 9562 | 9096 | 466 | 4.87 |
| 177_177_fnd | 29519 | 29056 | 463 | 1.57 |
| 46_47c_grndat | 27492 | 27034 | 458 | 1.67 |
| 498_499_dat | 20238 | 19780 | 458 | 2.26 |
| 502_503b_grnson | 8277 | 7830 | 447 | 5.40 |
| 43_43a_son | 9442 | 8998 | 444 | 4.70 |
| 412_413a_grnson | 8076 | 7649 | 427 | 5.29 |
| 57_57_fnd | 8619 | 8235 | 384 | 4.46 |
| 240_241b_grndat | 5219 | 4852 | 367 | 7.03 |
| 502_503a_grndat | 6622 | 6266 | 356 | 5.38 |
| 498_498_fnd | 3780 | 3426 | 354 | 9.37 |
| 57_57a_son | 5418 | 5071 | 347 | 6.40 |
| 596_596_fnd | 5432 | 5092 | 340 | 6.26 |
| 48_48_fnd | 6036 | 5717 | 319 | 5.28 |
| 300_301a_grnson | 3879 | 3565 | 314 | 8.09 |
| 46_46_fnd | 9008 | 8709 | 299 | 3.32 |
| 426_426a_son | 5075 | 4787 | 288 | 5.67 |
| 284_284_fnd | 21473 | 21195 | 278 | 1.29 |
| 502_502a_son | 6414 | 6144 | 270 | 4.21 |
| 110_110_fnd | 2163 | 1904 | 259 | 11.97 |
| 187_187a_son | 5883 | 5634 | 249 | 4.23 |
| 502_502_fnd | 4941 | 4698 | 243 | 4.92 |
| 322_323_dat | 5555 | 5381 | 174 | 3.13 |
| 187_188a_grnson | 1898 | 1726 | 172 | 9.06 |
| 596_597a_grnson | 3572 | 3422 | 150 | 4.20 |
| 43_43_fnd | 2310 | 2161 | 149 | 6.45 |
| 240_241c_grnson | 3751 | 3605 | 146 | 3.89 |
| 300_301_dat | 2112 | 1989 | 123 | 5.82 |
| 322_323b_grnson | 6074 | 5953 | 121 | 1.99 |
| 266_266_fnd | 1816 | 1731 | 85 | 4.68 |
| 177_178b_grnson | 1422 | 1349 | 73 | 5.13 |
| 322_322_fnd | 2236 | 2164 | 72 | 3.22 |
| 266_266a_son | 1440 | 1372 | 68 | 4.72 |
| 177_178a_grndat | 1364 | 1302 | 62 | 4.55 |
# Set minimum sites threshold
site_threshold = as.numeric(10)
# Create summary dataframe for labels, summing across all chromosomes (for pulled data only)
summary_df_filtered <- samples_obs_filtered_AAxAB %>%
filter(total >= site_threshold) %>%
group_by(sex) %>% # Remove chromosome grouping
summarise(
num_samples = n_distinct(Sample), # Count unique samples
total_sites = sum(total, na.rm = TRUE), # Sum all sites
.groups = 'drop'
)
# Create per-chromosome summary for faceted plot
summary_df_filtered_by_chrom <- samples_obs_filtered_AAxAB %>%
group_by(chrom, sex) %>%
summarise(
num_samples = n_distinct(Sample),
total_sites = sum(total, na.rm = TRUE),
.groups = 'drop'
)
# Plot 1: Pulled data with totals across all chromosomes
ggplot(samples_obs_filtered_AAxAB %>% filter(total >= site_threshold),
aes(fill = gt, y = prop, x = factor(sex, levels = c("Female", "Male")))) +
geom_bar(position = "fill", stat = "identity", width = 0.5) +
ylab("Genotype Proportion") +
labs(title = "Offspring Genotype, Crossing (AB) Male x (AA) Female
Filtered sites",
fill = "Genotype") +
theme_classic() +
theme(
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.y = element_text(size = 12),
strip.text.x = element_text(size = 12),
panel.spacing = unit(1, "lines"),
plot.margin = margin(t = 10, r = 10, b = 60, l = 10)
) +
scale_fill_manual(values = c("#ffbf00", "#66b032", "#1982c4")) +
# Add sample count labels (only for pulled data)
geom_text(data = summary_df_filtered,
aes(x = sex,
y = -0.03,
label = paste0(num_samples, " ", sex, "s")),
inherit.aes = FALSE,
size = 2.8,
vjust = 1) +
# Add total sites label with comma formatting (only for pulled data)
geom_text(data = summary_df_filtered,
aes(x = sex,
y = -0.18,
label = paste0(format(total_sites, big.mark = ","), " sites")),
inherit.aes = FALSE,
size = 2.8,
vjust = 0.5)
# clean plot without labels , for the MS
ggplot(samples_obs_filtered_AAxAB %>% filter(total >= site_threshold),
aes(fill = gt, y = prop, x = factor(sex, levels = c("Female", "Male")))) +
geom_bar(position = "fill", stat = "identity", width = 0.7) +
ylab("Genotype
Proportion") +
labs(title = "Offspring Genotype, Crossing (AB) Male x (AA) Female
Filtered sites",
fill = "Genotype") +
theme_classic() +
theme(
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.y = element_text(size = 14),
axis.title.y = element_text(size = 16),
strip.text.x = element_text(size = 12),
panel.spacing = unit(1, "lines"),
plot.margin = margin(t = 10, r = 10, b = 0, l = 10)
) +
scale_fill_manual(values = c("#ffbf00", "#66b032", "#1982c4"))# +
#facet_wrap(~family)
Figure 2: RNAseq of males and females
rna_dat <- read_csv("/Users/nuriteliash/Documents/GitHub/Varroa-RNAseq-Zheng-2025/data/rna-seq.csv")
rna_dat_summary <- rna_dat %>%
group_by(family, sex) %>%
summarize(
het_prop = sum(het) / sum(het + homo), # Proportion of heterozygous
count = sum(het + homo), # Total trials
sem = sqrt((het_prop * (1 - het_prop)) / count), # Standard error of the mean
ci_lower = het_prop - qt(0.975, df = count - 1) * sem, # Lower confidence interval
ci_upper = het_prop + qt(0.975, df = count - 1) * sem # Upper confidence interval
)
# Plot the data
rna_dat_summary %>%
ggplot(aes(x = sex, y = het_prop, colour = sex, group = family)) +
geom_line(position = position_dodge(width = 0.5), colour = "black") +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), size = 1, position = position_dodge(width = 0.5)) +
theme_bw() +
ylab("Proportion of heterozygous sites") +
xlab("Sex") +
theme(legend.position = "bottom") +
scale_y_continuous(labels = scales::percent_format()) +
scale_color_manual(values = c("Male" = "blue", "Female" = "#E31C79")) + # Custom colors
theme(legend.position = "none")
cat("\n**Figure 2.** RNA-seq analysis of heterozygous site proportions in male and female mites. Each point represents the mean proportion of heterozygous sites per individual mite, with error bars showing 95% confidence intervals. Points are connected by lines to show family relatedness of mother and son. The y-axis shows the proportion of heterozygous sites as a percentage of total sites analyzed.")
##
## **Figure 2.** RNA-seq analysis of heterozygous site proportions in male and female mites. Each point represents the mean proportion of heterozygous sites per individual mite, with error bars showing 95% confidence intervals. Points are connected by lines to show family relatedness of mother and son. The y-axis shows the proportion of heterozygous sites as a percentage of total sites analyzed.
Do heterozygocity proportion differ significantly between male and female mites?
# Reshape to wide format
df_wide <- rna_dat_summary %>%
select(family, sex, het_prop) %>%
pivot_wider(names_from = sex, values_from = het_prop)
# Paired t-test
t.test(df_wide$Female, df_wide$Male, paired = TRUE)
##
## Paired t-test
##
## data: df_wide$Female and df_wide$Male
## t = 0.45559, df = 4, p-value = 0.6723
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -0.1116696 0.1555112
## sample estimates:
## mean difference
## 0.02192078