load libraries

library("tidyverse")
library("plyr")
library("dplyr")
library("ggplot2")
library("scales")
library("ggpubr")
library("gridExtra")
library("grid")
library("GGally")
library("vcfR") # for extracting genotype data from a vcf file
library("data.table")
library("stringr")
library("janitor")
library("plotly") # for the 3d plots
library("ggfortify")
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE,
                     fig.width = 10,
                      fig.asp = 0.6,
                      out.width = "100%")

Load Variant Call Format (VCF) file.

Extract genotypes for each site and individual. The metadata for all samples can be found in here.

vcf <- read.vcfR("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/Q40BIALLDP16HDP40mis.5Chr7/Q40BIALLDP16HDP40mis.5Chr7.recode.vcf", verbose = FALSE )
vcf
## ***** Object of Class vcfR *****
## 223 samples
## 7 CHROMs
## 35,169 variants
## Object size: 293.5 Mb
## 0 percent missing data
## *****        *****         *****
# extract the genotype for each site in each individual
gt <- extract.gt(vcf, element = "GT") 
gt <- as.data.frame(t(gt)) %>%
  rownames_to_column("ID")
#clean the ID names
gt$ID <- sub("_[^_]+$", "", gt$ID)

table <-  gt %>% 
  t() %>%
  as.data.frame() %>%
  row_to_names(row_number = 1) # %>%
 # dplyr::select(-c("48_49e_grn", "48_49f_grn", "48_49g_grn")) # remove samples: 48_49e_grn, 48_49f_grn, 48_49g_grn. they were born in a second cycle, and in the first cycle an adult male was born in F2. so i cannot tell for sure who is the father 

read the meta data table and set the family

meta <- read_csv("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/meta_more.csv")

# set the families 
family = grep("fnd",gt$ID, value=TRUE) %>%
  str_extract("[^_]+")  %>%
  unique()

Control: Homozygotic sites (crosses 1-4)

(1) F1 male (0/0) x female (0/0)

we expect all F2 offspring to be homozygotic (0/0) like their parents (F1)

# Prepare table with observed and expected counts, per family:
# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
        dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/0")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/0")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/0")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
#  mutate(total = as.numeric(sum(n))) %>%
  dplyr::rename(obs = n)
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
# replace "NA" with zero value when gt doesn't exists
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1"))) # add all possible genotypes

df <- left_join(samples, observed, by=c("sample","gt")) %>%  
    replace(is.na(.),0) %>% # replace "NA" with zero value when gt doesn't exists 
    group_by(sample) %>%
    mutate(total = as.numeric(sum(obs))) %>%
    mutate(freq=round(obs/total, digit=2)) %>%
  left_join(meta, by="sample") %>%
    dplyr::rename(count = obs)

# order the stages 
df$sex <- factor(df$sex, level=c("female", "male", "Nymph", "Egg"))

p = df %>% #dplyr::filter(total>10) %>%
  ggplot(aes(x = sex, y = freq, colour = gt, shape = adult_sisters, 
              text = paste("sample:", sample,
                          ", n:", total))) + 
  geom_boxplot(outlier.shape = NA, coef=0 ) +
  geom_jitter(width=0.1, size=2) +
  scale_color_manual(values=c("#ffbf00", "#66b032","#1982c4")) +
  theme_bw() +
scale_shape_manual(values=c(1,19)) +
  ggtitle('F2 offspring of F1 male (0/0) x female (0/0)') +
    facet_wrap( ~gt)  +  guides(color = "none") +
 theme(plot.margin = margin(1, 1, 1, 1, "cm"))

(2) F1 male (1/1) x female (1/1)

we expect all F2 offspring to be homozygotic (1/1) like their parents (F1)

# Prepare table with observed and expected counts, per family:
# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
        dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "1/1")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "1/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "1/1")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
#  mutate(total = as.numeric(sum(n))) %>%
  dplyr::rename(obs = n)
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
# replace "NA" with zero value when gt doesn't exists
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1"))) # add all possible genotypes

df <- left_join(samples, observed, by=c("sample","gt")) %>%  
    replace(is.na(.),0) %>% # replace "NA" with zero value when gt doesn't exists 
    group_by(sample) %>%
    mutate(total = as.numeric(sum(obs))) %>%
    mutate(freq=round(obs/total, digit=2)) %>%
  left_join(meta, by="sample") %>%
    dplyr::rename(count = obs)

# order the stages 
df$sex <- factor(df$sex, level=c("female", "male", "Nymph", "Egg"))

p = df %>% #dplyr::filter(total>10) %>%
  ggplot(aes(x = sex, y = freq, colour = gt, shape = adult_sisters, 
              text = paste("sample:", sample,
                          ", n:", total))) + 
  geom_boxplot(outlier.shape = NA, coef=0 ) +
  geom_jitter(width=0.1, size=2) +
  scale_color_manual(values=c("#ffbf00", "#66b032","#1982c4")) +
  theme_bw() +
scale_shape_manual(values=c(1,19)) +
  ggtitle('F2 offspring of F1 male (1/1) x female (1/1)') +
    facet_wrap( ~gt)  +  guides(color = "none") +
 theme(plot.margin = margin(1, 1, 1, 1, "cm"))

(3) F1 male (0/0) x female (1/1)

We expect zero sites for this cross, since there is no paternal inheritance to the males.
Indeed, there are only 25 sites in the F2 offspring, and the genotype distribution is random.

# Prepare table with observed and expected counts, per family:
# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
        dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/0")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "1/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "1/1")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
#  mutate(total = as.numeric(sum(n))) %>%
  dplyr::rename(obs = n)
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
# replace "NA" with zero value when gt doesn't exists
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1"))) # add all possible genotypes

df <- left_join(samples, observed, by=c("sample","gt")) %>%  
    replace(is.na(.),0) %>% # replace "NA" with zero value when gt doesn't exists 
    group_by(sample) %>%
    mutate(total = as.numeric(sum(obs))) %>%
    mutate(freq=round(obs/total, digit=2)) %>%
  left_join(meta, by="sample") %>%
    dplyr::rename(count = obs)

# order the stages 
df$sex <- factor(df$sex, level=c("female", "male", "Nymph", "Egg"))

p = df %>% #dplyr::filter(total>10) %>%
  ggplot(aes(x = sex, y = freq, colour = gt, shape = adult_sisters, 
              text = paste("sample:", sample,
                          ", n:", total))) + 
  geom_boxplot(outlier.shape = NA, coef=0 ) +
  geom_jitter(width=0.1, size=2) +
  scale_color_manual(values=c("#ffbf00", "#66b032","#1982c4")) +
  theme_bw() +
scale_shape_manual(values=c(1,19)) +
  ggtitle('F2 offspring of F1 male (0/0) x female (1/1)') +
    facet_wrap( ~gt)  +  guides(color = "none") +
 theme(plot.margin = margin(1, 1, 1, 1, "cm"))

(4) F1 male (1/1) x female (0/0)

We expect zero sites for this cross, since there is no paternal inheritance to the males.
Indeed, there are only 24 sites in the F2 pooled females, and 15 for the F2 pooled males:

# Prepare table with observed and expected counts, per family:
# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
        dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "1/1")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "1/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/0")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
#  mutate(total = as.numeric(sum(n))) %>%
  dplyr::rename(obs = n)
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
# replace "NA" with zero value when gt doesn't exists
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1"))) # add all possible genotypes

df <- left_join(samples, observed, by=c("sample","gt")) %>%  
    replace(is.na(.),0) %>% # replace "NA" with zero value when gt doesn't exists 
    group_by(sample) %>%
    mutate(total = as.numeric(sum(obs))) %>%
    mutate(freq=round(obs/total, digit=2)) %>%
  left_join(meta, by="sample") %>%
    dplyr::rename(count = obs)

# order the stages 
df$sex <- factor(df$sex, level=c("female", "male", "Nymph", "Egg"))

p = df %>% dplyr::filter(total>10) %>%
  ggplot(aes(x = sex, y = freq, colour = gt, shape = adult_sisters, 
              text = paste("sample:", sample,
                          ", n:", total))) + 
  geom_boxplot(outlier.shape = NA, coef=0 ) +
  geom_jitter(width=0.1, size=2) +
  scale_color_manual(values=c("#ffbf00", "#66b032","#1982c4")) +
  theme_bw() +
scale_shape_manual(values=c(1,19)) +
  ggtitle('F2 offspring of F1 male (1/1) x female (0/0)') +
    facet_wrap( ~gt)  +  guides(color = "none") +
 theme(plot.margin = margin(1, 1, 1, 1, "cm"))

only family 63 has sites in F2 for this cross.
something is wrong…

Crossing homozygotic male, with heterozygotic female

(5) F1 male (0/0) x female (0/1)

# Prepare table with observed and expected counts, per family:
# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
        dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/0")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/0")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/1")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
#  mutate(total = as.numeric(sum(n))) %>%
  dplyr::rename(obs = n)
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
# replace "NA" with zero value when gt doesn't exists
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1"))) # add all possible genotypes

df <- left_join(samples, observed, by=c("sample","gt")) %>%  
    replace(is.na(.),0) %>% # replace "NA" with zero value when gt doesn't exists 
    group_by(sample) %>%
    mutate(total = as.numeric(sum(obs))) %>%
    mutate(freq=round(obs/total, digit=2)) %>%
  left_join(meta, by="sample") %>%
    dplyr::rename(count = obs)

# order the stages 
df$sex <- factor(df$sex, level=c("female", "male", "Nymph", "Egg"))

p = df %>% dplyr::filter(total>10) %>%
  ggplot(aes(x = sex, y = freq, colour = gt, shape = adult_sisters, 
              text = paste("sample:", sample,
                          ", n:", total))) + 
  geom_boxplot(outlier.shape = NA, coef=0 ) +
  geom_jitter(width=0.1, size=2) +
  scale_color_manual(values=c("#ffbf00", "#66b032","#1982c4")) +
  theme_bw() +
scale_shape_manual(values=c(1,19)) +
  ggtitle('F2 offspring of F1 male (0/0) x female (0/1)') +
    facet_wrap( ~gt)  +  guides(color = "none") +
 theme(plot.margin = margin(1, 1, 1, 1, "cm"))

(6) F1 male (1/1) x female (0/1)

# Prepare table with observed and expected counts, per family:
# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
        dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "1/1")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "1/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/1")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
#  mutate(total = as.numeric(sum(n))) %>%
  dplyr::rename(obs = n)
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
# replace "NA" with zero value when gt doesn't exists
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1"))) # add all possible genotypes

df <- left_join(samples, observed, by=c("sample","gt")) %>%  
    replace(is.na(.),0) %>% # replace "NA" with zero value when gt doesn't exists 
    group_by(sample) %>%
    mutate(total = as.numeric(sum(obs))) %>%
    mutate(freq=round(obs/total, digit=2)) %>%
  left_join(meta, by="sample") %>%
    dplyr::rename(count = obs)

# order the stages 
df$sex <- factor(df$sex, level=c("female", "male", "Nymph", "Egg"))

p = df %>% dplyr::filter(total>10) %>%
  ggplot(aes(x = sex, y = freq, colour = gt, shape = adult_sisters, 
              text = paste("sample:", sample,
                          ", n:", total))) + 
  geom_boxplot(outlier.shape = NA, coef=0 ) +
  geom_jitter(width=0.1, size=2) +
  scale_color_manual(values=c("#ffbf00", "#66b032","#1982c4")) +
  theme_bw() +
scale_shape_manual(values=c(1,19)) +
  ggtitle('F2 offspring of F1 male (1/1) x female (0/1)') +
    facet_wrap( ~gt)  +  guides(color = "none") +
 theme(plot.margin = margin(1, 1, 1, 1, "cm"))

Crossing heterozygotic male, with homozygotic female

(7) F1 male (0/1) x female (0/0)

# Prepare table with observed and expected counts, per family:
# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
        dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/1")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/0")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
#  mutate(total = as.numeric(sum(n))) %>%
  dplyr::rename(obs = n)
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
# replace "NA" with zero value when gt doesn't exists
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1"))) # add all possible genotypes

df <- left_join(samples, observed, by=c("sample","gt")) %>%  
    replace(is.na(.),0) %>% # replace "NA" with zero value when gt doesn't exists 
    group_by(sample) %>%
    mutate(total = as.numeric(sum(obs))) %>%
    mutate(freq=round(obs/total, digit=2)) %>%
  left_join(meta, by="sample") %>%
    dplyr::rename(count = obs)

# order the stages 
df$sex <- factor(df$sex, level=c("female", "male", "Nymph", "Egg"))

p = df %>% dplyr::filter(total>10) %>%
  ggplot(aes(x = sex, y = freq, colour = gt, shape = adult_sisters, 
              text = paste("sample:", sample,
                          ", n:", total))) + 
  geom_boxplot(outlier.shape = NA, coef=0 ) +
  geom_jitter(width=0.1, size=2) +
  scale_color_manual(values=c("#ffbf00", "#66b032","#1982c4")) +
  theme_bw() +
scale_shape_manual(values=c(1,19)) +
  ggtitle('F2 offspring of F1 male (0/1) x female (0/0)') +
    facet_wrap( ~gt)  +  guides(color = "none") +
 theme(plot.margin = margin(1, 1, 1, 1, "cm"))

(8) F1 male (0/1) x female (1/1)

# Prepare table with observed and expected counts, per family:
# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
        dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/1")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "1/1")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
#  mutate(total = as.numeric(sum(n))) %>%
  dplyr::rename(obs = n)
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
# replace "NA" with zero value when gt doesn't exists
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1"))) # add all possible genotypes

df <- left_join(samples, observed, by=c("sample","gt")) %>%  
    replace(is.na(.),0) %>% # replace "NA" with zero value when gt doesn't exists 
    group_by(sample) %>%
    mutate(total = as.numeric(sum(obs))) %>%
    mutate(freq=round(obs/total, digit=2)) %>%
  left_join(meta, by="sample") %>%
    dplyr::rename(count = obs)

# order the stages 
df$sex <- factor(df$sex, level=c("female", "male", "Nymph", "Egg"))

p = df %>% dplyr::filter(total>10) %>%
  ggplot(aes(x = sex, y = freq, colour = gt, shape = adult_sisters, 
              text = paste("sample:", sample,
                          ", n:", total))) + 
  geom_boxplot(outlier.shape = NA, coef=0 ) +
  geom_jitter(width=0.1, size=2) +
  scale_color_manual(values=c("#ffbf00", "#66b032","#1982c4")) +
  theme_bw() +
scale_shape_manual(values=c(1,19)) +
  ggtitle('F2 offspring of F1 male (0/1) x female (1/1)') +
    facet_wrap( ~gt)  +  guides(color = "none") +
 theme(plot.margin = margin(1, 1, 1, 1, "cm"))

Crossing heterozygotic male and female

(9) F1 male (0/1) x female (0/1)

# Prepare table with observed and expected counts, per family:
# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
        dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/1")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/1")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
#  mutate(total = as.numeric(sum(n))) %>%
  dplyr::rename(obs = n)
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
# replace "NA" with zero value when gt doesn't exists
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1"))) # add all possible genotypes

df <- left_join(samples, observed, by=c("sample","gt")) %>%  
    replace(is.na(.),0) %>% # replace "NA" with zero value when gt doesn't exists 
    group_by(sample) %>%
    mutate(total = as.numeric(sum(obs))) %>%
    mutate(freq=round(obs/total, digit=2)) %>%
  left_join(meta, by="sample") %>%
    dplyr::rename(count = obs)

# order the stages 
df$sex <- factor(df$sex, level=c("female", "male", "Nymph", "Egg"))

p = df %>% dplyr::filter(total>10) %>%
  ggplot(aes(x = sex, y = freq, colour = gt, shape = adult_sisters, 
              text = paste("sample:", sample,
                          ", n:", total))) + 
  geom_boxplot(outlier.shape = NA, coef=0 ) +
  geom_jitter(width=0.1, size=2) +
  scale_color_manual(values=c("#ffbf00", "#66b032","#1982c4")) +
  theme_bw() +
scale_shape_manual(values=c(1,19)) +
  ggtitle('F2 offspring of F1 male (0/1) x female (0/1)') +
    facet_wrap( ~gt)  +  guides(color = "none") +
 theme(plot.margin = margin(1, 1, 1, 1, "cm"))