load libraries

library("tidyverse")
library("plyr")
library("dplyr")
library("ggplot2")
library("scales")
library("ggpubr")
library("gridExtra")
library("grid")
library("GGally")
library("vcfR") # for extracting genotype data from a vcf file
library("data.table")
library("stringr")
library("janitor")
library("gmodels")
library("rstatix")
library("freqtables")
library("broom")

#library("patchwork") # for gathering the plots
library("cowplot") 
#library("aspi") # Repeated G–tests of Goodness-of-Fit, work only for 2 variables..
#library("RVAideMemoire") # Repeated G–tests of Goodness-of-Fit, work only for 2 variables...
#library("InfoTrad")
#library("ggthemes") # for more colors in the ggplot
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE,
                     fig.width = 10,
                      fig.asp = 0.8,
                      out.width = "100%")
#fig.width = 6,fig.asp = 0.8,out.width = "100%"

the VCF was filtered :

vcftools --vcf snponly_freebayes.vcf --chr NW_019211454.1 --chr NW_019211455.1 --chr NW_019211456.1 --chr NW_019211457.1 --chr NW_019211458.1 --chr NW_019211459.1 --chr NW_019211460.1 --max-alleles 2 --minQ 15000 --minDP 16 --maxDP 40 --max-missing 0.5 --maf 0.2 --recode --recode-INFO-all --out Q15000BIALLDP16HDP40mis.5maf.2Chr7  

Load Variant Call Format (VCF) file.

vcf <- read.vcfR("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/minQ_filter/Q15000BIALLDP16HDP40mis.5maf.2Chr7.recode.vcf", verbose = FALSE )
vcf
## ***** Object of Class vcfR *****
## 223 samples
## 7 CHROMs
## 33,925 variants
## Object size: 272.8 Mb
## 0 percent missing data
## *****        *****         *****
# extract the genotype for each site in each individual
gt <- extract.gt(vcf, element = "GT") 
gt <- as.data.frame(t(gt)) %>%
  rownames_to_column("ID")
#clean the ID names
gt$ID <- sub("_[^_]+$", "", gt$ID)

table <-  gt %>% 
  t() %>%
  as.data.frame() %>%
  row_to_names(row_number = 1) %>% 
  dplyr::select(contains(c("son", "dat", "fnd"))) # keep only adults of F0, F1 and F2 

# set the families (include only families with at least one adult F2)
family = grep("grndat|grnson",gt$ID, value=TRUE) %>%
  str_extract("[^_]+")  %>%
  unique()

# or, include all F2 samples, but indicate if they have an adult sister (may indicate if the F1 female was fertilized)
#family = grep("grn",gt$ID, value=TRUE) %>%
#  str_extract("[^_]+")  %>%
#  unique()

‘Sanity check’: Crossing homozygotic sites (crosses 1-4)

(1) F1 male (0/0) x female (0/0)

we expect all F2 offspring to be homozygotic (0/0) like their parents (F1)

# define a list to put all the data frames in
# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
    dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/0")) %>% # force F0 female to be homo, like her son
  dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/0")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/0")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
  mutate(total = as.numeric(sum(n))) %>%
  mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female"))
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1")))

samples_obs <- left_join(samples, observed, by=c("sample","gt")) %>% mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female")) %>%
    group_by(sample) %>%
    replace(is.na(.), 0) %>%
    mutate(total = as.numeric(sum(n))) %>%
    dplyr::mutate(prop = n/total) 
  
pooled_obs_count =  samples_obs %>% 
    dplyr::filter(total>=10) %>%
    filter(gt =="0/0") %>%
    group_by(sex) %>% 
    mutate(sites = sum(total)) %>% 
    select(c(sex,sites)) %>% 
    unique()

pooled_obs_count
## # A tibble: 2 × 2
## # Groups:   sex [2]
##   sex    sites
##   <chr>  <dbl>
## 1 male   35620
## 2 female 35003
p_00_00 =  samples_obs %>% dplyr::filter(total>=10) %>%
    ggplot(aes(fill=gt, y=prop, x=sex)) + 
    geom_bar(position="fill", stat="identity", ) +
    labs(fill = "Genotype") + 
    theme_classic() +
    theme(axis.text.x = element_text(size=12),
        axis.text.y = element_text(size=12)) +
     scale_fill_manual(values=c("#ffbf00", "#66b032","#1982c4"))+
     theme(legend.position = "none")+
  ggtitle("F1 male (0/0) x F1 female (0/0)",
          subtitle = paste("F2 female = ", filter(pooled_obs_count, sex == 'female')$sites, ", F2 male =", filter(pooled_obs_count, sex == 'male')$sites))

(2) F1 male (1/1) x female (1/1)

we expect all F2 offspring to be homozygotic (1/1) like their parents (F1)

# define a list to put all the data frames in
# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
    dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "1/1")) %>% # force F0 female to be homo, like her son
  dplyr::filter_at(vars(matches("_son")), all_vars(. == "1/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "1/1")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
  mutate(total = as.numeric(sum(n))) %>%
  mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female"))
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1")))

samples_obs <- left_join(samples, observed, by=c("sample","gt")) %>% mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female")) %>%
    group_by(sample) %>%
    replace(is.na(.), 0) %>%
    mutate(total = as.numeric(sum(n))) %>%
    dplyr::mutate(prop = n/total) 
  
pooled_obs_count =  samples_obs %>% 
    dplyr::filter(total>=10) %>%
    filter(gt =="1/1") %>%
    group_by(sex) %>% 
    mutate(sites = sum(total)) %>% 
    select(c(sex,sites)) %>% 
    unique()

pooled_obs_count
## # A tibble: 2 × 2
## # Groups:   sex [2]
##   sex    sites
##   <chr>  <dbl>
## 1 male   24520
## 2 female 23344
p_11_11 =  samples_obs %>% dplyr::filter(total>=10) %>%
    ggplot(aes(fill=gt, y=prop, x=sex)) + 
    geom_bar(position="fill", stat="identity", ) +
    labs(fill = "Genotype") + 
    theme_classic() +
    theme(axis.text.x = element_text(size=12),
        axis.text.y = element_text(size=12)) +
     scale_fill_manual(values=c("#ffbf00", "#66b032","#1982c4"))+
     theme(legend.position = "none")+
  ggtitle("F1 male (1/1) x F1 female (1/1)",
          subtitle = paste("F2 female = ", filter(pooled_obs_count, sex == 'female')$sites, ", F2 male =", filter(pooled_obs_count, sex == 'male')$sites))

(3) F1 male (0/0) x female (1/1)

We expect zero sites for this cross, because the F1 are siblings.
Indeed, there are only 23 sites in the F2 pooled females, and 14 for the F2 pooled males:

# define a list to put all the data frames in
# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
    dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/0")) %>% # force F0 female to be homo, like her son
  dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/0")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "1/1")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
  mutate(total = as.numeric(sum(n))) %>%
  mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female"))
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1")))

samples_obs <- left_join(samples, observed, by=c("sample","gt")) %>% mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female")) %>%
    group_by(sample) %>%
    replace(is.na(.), 0) %>%
    mutate(total = as.numeric(sum(n))) %>%
    dplyr::mutate(prop = n/total) 
  
pooled_obs_count =  samples_obs %>% 
    dplyr::filter(total>=10) %>%
    filter(gt =="0/0") %>%
    group_by(sex) %>% 
    mutate(sites = sum(total)) %>% 
    select(c(sex,sites)) %>% 
    unique()

pooled_obs_count
## # A tibble: 0 × 2
## # Groups:   sex [0]
## # … with 2 variables: sex <chr>, sites <dbl>
p_00_11 =  samples_obs %>% dplyr::filter(total>=10) %>%
    ggplot(aes(fill=gt, y=prop, x=sex)) + 
    geom_bar(position="fill", stat="identity", ) +
    labs(fill = "Genotype") + 
    theme_classic() +
    theme(axis.text.x = element_text(size=12),
        axis.text.y = element_text(size=12)) +
     scale_fill_manual(values=c("#ffbf00", "#66b032","#1982c4"))+
     theme(legend.position = "none")+
  ggtitle("F1 male (0/0) x F1 female (1/1)",
          subtitle = paste("F2 female = ", filter(pooled_obs_count, sex == 'female')$sites, ", F2 male =", filter(pooled_obs_count, sex == 'male')$sites))

(4) F1 male (1/1) x female (0/0)

We expect zero sites for this cross, since there is no paternal inheritance to the males.

# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
    dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "1/1")) %>% # force F0 female to be homo, like her son
  dplyr::filter_at(vars(matches("_son")), all_vars(. == "1/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/0")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
  mutate(total = as.numeric(sum(n))) %>%
  mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female"))
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1")))

samples_obs <- left_join(samples, observed, by=c("sample","gt")) %>% mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female")) %>%
    group_by(sample) %>%
    replace(is.na(.), 0) %>%
    mutate(total = as.numeric(sum(n))) %>%
    dplyr::mutate(prop = n/total) 
  
pooled_obs_count =  samples_obs %>% 
    dplyr::filter(total>=10) %>%
    filter(gt =="1/1") %>%
    group_by(sex) %>% 
    mutate(sites = sum(total)) %>% 
    select(c(sex,sites)) %>% 
    unique()

pooled_obs_count
## # A tibble: 0 × 2
## # Groups:   sex [0]
## # … with 2 variables: sex <chr>, sites <dbl>
p_11_00 = samples_obs %>% dplyr::filter(total>=10) %>%
    ggplot(aes(fill=gt, y=prop, x=sex)) + 
    geom_bar(position="fill", stat="identity", ) +
    labs(fill = "Genotype") + 
    theme_classic() +
    theme(axis.text.x = element_text(size=12),
        axis.text.y = element_text(size=12)) +
     scale_fill_manual(values=c("#ffbf00", "#66b032","#1982c4"))+
     theme(legend.position = "none")+
  ggtitle("F1 male (1/1) x F1 female (0/0)",
          subtitle = paste("F2 female = ", filter(pooled_obs_count, sex == 'female')$sites, ", F2 male =", filter(pooled_obs_count, sex == 'male')$sites))

Crossing homozygotic male, with heterozygotic female

(5) F1 male (0/0) x female (0/1)

# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
    dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/0")) %>% # force F0 female to be homo, like her son
  dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/0")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/1")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
  mutate(total = as.numeric(sum(n))) %>%
  mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female"))
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1")))

samples_obs <- left_join(samples, observed, by=c("sample","gt")) %>% mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female")) %>%
    group_by(sample) %>%
    replace(is.na(.), 0) %>%
    mutate(total = as.numeric(sum(n))) %>%
    dplyr::mutate(prop = n/total) 
  
pooled_obs_count =  samples_obs %>% 
    dplyr::filter(total>=10) %>%
    filter(gt =="0/0") %>%
    group_by(sex) %>% 
    mutate(sites = sum(total)) %>% 
    select(c(sex,sites)) %>% 
    unique()

pooled_obs_count
## # A tibble: 2 × 2
## # Groups:   sex [2]
##   sex    sites
##   <chr>  <dbl>
## 1 male    5691
## 2 female  4942
p_00_01 = samples_obs %>% dplyr::filter(total>=10) %>%
    ggplot(aes(fill=gt, y=prop, x=sex)) + 
    geom_bar(position="fill", stat="identity", ) +
    labs(fill = "Genotype") + 
    theme_classic() +
    theme(axis.text.x = element_text(size=12),
        axis.text.y = element_text(size=12)) +
     scale_fill_manual(values=c("#ffbf00", "#66b032","#1982c4"))+
     theme(legend.position = "none")+
  ggtitle("F1 male (0/0) x F1 female (0/1)",
          subtitle = paste("F2 female = ", filter(pooled_obs_count, sex == 'female')$sites, ", F2 male =", filter(pooled_obs_count, sex == 'male')$sites))

(6) F1 male (1/1) x female (0/1)

# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
    dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "1/1")) %>% # force F0 female to be homo, like her son
  dplyr::filter_at(vars(matches("_son")), all_vars(. == "1/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/1")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
  mutate(total = as.numeric(sum(n))) %>%
  mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female"))
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1")))

samples_obs <- left_join(samples, observed, by=c("sample","gt")) %>% mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female")) %>%
    group_by(sample) %>%
    replace(is.na(.), 0) %>%
    mutate(total = as.numeric(sum(n))) %>%
    dplyr::mutate(prop = n/total) 
  
pooled_obs_count =  samples_obs %>% 
    dplyr::filter(total>=10) %>%
    filter(gt =="0/1") %>%
    group_by(sex) %>% 
    mutate(sites = sum(total)) %>% 
    select(c(sex,sites)) %>% 
    unique()

pooled_obs_count
## # A tibble: 2 × 2
## # Groups:   sex [2]
##   sex    sites
##   <chr>  <dbl>
## 1 male    6101
## 2 female  5389
p_11_01 = samples_obs %>% dplyr::filter(total>=10) %>%
    ggplot(aes(fill=gt, y=prop, x=sex)) + 
    geom_bar(position="fill", stat="identity", ) +
    labs(fill = "Genotype") + 
    theme_classic() +
    theme(axis.text.x = element_text(size=12),
        axis.text.y = element_text(size=12)) +
     scale_fill_manual(values=c("#ffbf00", "#66b032","#1982c4"))+
     theme(legend.position = "none")+
  ggtitle("F1 male (1/1) x F1 female (0/1)",
          subtitle = paste("F2 female = ", filter(pooled_obs_count, sex == 'female')$sites, ", F2 male =", filter(pooled_obs_count, sex == 'male')$sites))

Crossing heterozygotic male, with homozygotic female

The former crosses of heterozygotic females (5 and 6) show that F2 males can be heterozygotic and carry two alleles, like their mother.
But are these sites real? and if they are, can they be transmitted to their daughters?

(7) F1 male (0/1) x female (0/0)

# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
    dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/1")) %>% # force F0 female to be homo, like her son
  dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/0")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
  mutate(total = as.numeric(sum(n))) %>%
  mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female"))
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1")))

samples_obs <- left_join(samples, observed, by=c("sample","gt")) %>% mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female")) %>%
    group_by(sample) %>%
    replace(is.na(.), 0) %>%
    mutate(total = as.numeric(sum(n))) %>%
    dplyr::mutate(prop = n/total) 
  
pooled_obs_count =  samples_obs %>% 
    dplyr::filter(total>=10) %>%
    filter(gt =="0/0") %>%
    group_by(sex) %>% 
    mutate(sites = sum(total)) %>% 
    select(c(sex,sites)) %>% 
    unique()

pooled_obs_count
## # A tibble: 2 × 2
## # Groups:   sex [2]
##   sex    sites
##   <chr>  <dbl>
## 1 male    1978
## 2 female  2465
p_01_00 = samples_obs %>% dplyr::filter(total>=10) %>%
    ggplot(aes(fill=gt, y=prop, x=sex)) + 
    geom_bar(position="fill", stat="identity", ) +
    labs(fill = "Genotype") + 
    theme_classic() +
    theme(axis.text.x = element_text(size=12),
        axis.text.y = element_text(size=12)) +
     scale_fill_manual(values=c("#ffbf00", "#66b032","#1982c4"))+
     theme(legend.position = "none")+
  ggtitle("F1 male (0/1) x F1 female (0/0)",
          subtitle = paste("F2 female = ", filter(pooled_obs_count, sex == 'female')$sites, ", F2 male =", filter(pooled_obs_count, sex == 'male')$sites))

(8) F1 male (0/1) x female (1/1)

# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
    dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/1")) %>% # force F0 female to be homo, like her son
  dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "1/1")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
  mutate(total = as.numeric(sum(n))) %>%
  mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female"))
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1")))

samples_obs <- left_join(samples, observed, by=c("sample","gt")) %>% mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female")) %>%
    group_by(sample) %>%
    replace(is.na(.), 0) %>%
    mutate(total = as.numeric(sum(n))) %>%
    dplyr::mutate(prop = n/total) 
  
pooled_obs_count =  samples_obs %>% 
    dplyr::filter(total>=10) %>%
    filter(gt =="0/1") %>%
    group_by(sex) %>% 
    mutate(sites = sum(total)) %>% 
    select(c(sex,sites)) %>% 
    unique()

pooled_obs_count
## # A tibble: 2 × 2
## # Groups:   sex [2]
##   sex    sites
##   <chr>  <dbl>
## 1 male    1573
## 2 female  2391
p_01_11 = samples_obs %>% dplyr::filter(total>=10) %>%
    ggplot(aes(fill=gt, y=prop, x=sex)) + 
    geom_bar(position="fill", stat="identity", ) +
    labs(fill = "Genotype") + 
    theme_classic() +
    theme(axis.text.x = element_text(size=12),
        axis.text.y = element_text(size=12)) +
     scale_fill_manual(values=c("#ffbf00", "#66b032","#1982c4"))+
     theme(legend.position = "none")+
  ggtitle("F1 male (0/1) x F1 female (1/1)",
          subtitle = paste("F2 female = ", filter(pooled_obs_count, sex == 'female')$sites, ", F2 male =", filter(pooled_obs_count, sex == 'male')$sites))

Crossing heterozygotic male and female

(9) F1 male (0/1) x female (0/1)

# define a list to put all the data frames in
obs <- list()

for (fam in family) {
 
obs[[fam]] <- table %>%
  dplyr::select(starts_with(fam)) %>%
    dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/1")) %>% # force F0 female to be homo, like her son
  dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/1")) %>%
  dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/1")) %>% 
  dplyr::select(contains("grn")) %>%
  tidyr::pivot_longer(everything())  %>% 
  dplyr::rename(sample = name, gt = value) %>%
  dplyr::count(sample, gt, .drop = FALSE) %>% 
  dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
  mutate(n = as.numeric(n)) %>%
  group_by(sample) %>%
  mutate(total = as.numeric(sum(n))) %>%
  mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female"))
}

# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1")))

samples_obs <- left_join(samples, observed, by=c("sample","gt")) %>% mutate(sex = case_when(
    grepl("son", sample) ~ "male",
    grepl("dat", sample) ~ "female")) %>%
    group_by(sample) %>%
    replace(is.na(.), 0) %>%
    mutate(total = as.numeric(sum(n))) %>%
    dplyr::mutate(prop = n/total) 
  
pooled_obs_count =  samples_obs %>% 
    dplyr::filter(total>=10) %>%
    filter(gt =="0/1") %>%
    group_by(sex) %>% 
    mutate(sites = sum(total)) %>% 
    select(c(sex,sites)) %>% 
    unique()

pooled_obs_count
## # A tibble: 2 × 2
## # Groups:   sex [2]
##   sex    sites
##   <chr>  <dbl>
## 1 male    3657
## 2 female  4618
p_01_01 = samples_obs %>% dplyr::filter(total>=10) %>%
    ggplot(aes(fill=gt, y=prop, x=sex)) + 
    geom_bar(position="fill", stat="identity", ) +
    labs(fill = "Genotype") + 
    theme_classic() +
    theme(axis.text.x = element_text(size=12),
        axis.text.y = element_text(size=12)) +
     scale_fill_manual(values=c("#ffbf00", "#66b032","#1982c4"))+
     theme(legend.position = "none")+
  ggtitle("F1 male (0/1) x F1 female (0/1)",
          subtitle = paste("F2 female = ", filter(pooled_obs_count, sex == 'female')$sites, ", F2 male =", filter(pooled_obs_count, sex == 'male')$sites))

plot sanity check crosses

legend <- get_legend(p_00_00)   # get the legend of the first one plot

# here the plots in a grid
prow <- plot_grid( p_00_00 + theme(legend.position="none"),
           # here you add the percentage
           p_11_11 + theme(legend.position="none")+ scale_y_continuous(),
           p_11_00 + theme(legend.position="none")+ scale_y_continuous(),
          p_00_11 + theme(legend.position="none")+ scale_y_continuous(),
         align = 'v',
           #labels = c("A", "B"),
           hjust = -1,
           nrow = 2)

# here you add the legend
plot_grid( prow, legend, rel_widths = c(1, .2))

plot informative crosses

legend <- get_legend(p_01_01)   # get the legend of the first one plot

# here the plots in a grid
prow <- plot_grid( p_00_01 + theme(legend.position="none"),
           # here you add the percentage
           p_11_01 + theme(legend.position="none")+ scale_y_continuous(),
           p_01_00 + theme(legend.position="none")+ scale_y_continuous(),
          p_01_11 + theme(legend.position="none")+ scale_y_continuous(),
         align = 'v',
           #labels = c("A", "B"),
           hjust = -1,
           nrow = 2)

# here you add the legend
plot_grid( prow, legend, rel_widths = c(1, .2))