library("tidyverse")
library("plyr")
library("dplyr")
library("ggplot2")
library("scales")
library("ggpubr")
library("gridExtra")
library("grid")
library("GGally")
library("vcfR") # for extracting genotype data from a vcf file
library("data.table")
library("stringr")
library("janitor")
library("plotly") # for the 3d plots
library("ggfortify")
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE,
fig.width = 10,
fig.asp = 0.6,
out.width = "100%")
Extract genotypes for each site and individual. The metadata for all samples can be found in here.
vcf <- read.vcfR("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/Q40BIALLDP16HDP40mis.5Chr7/Q40BIALLDP16HDP40mis.5Chr7.recode.vcf", verbose = FALSE )
vcf
## ***** Object of Class vcfR *****
## 223 samples
## 7 CHROMs
## 35,169 variants
## Object size: 293.5 Mb
## 0 percent missing data
## ***** ***** *****
# extract the genotype for each site in each individual
gt <- extract.gt(vcf, element = "GT")
gt <- as.data.frame(t(gt)) %>%
rownames_to_column("ID")
#clean the ID names
gt$ID <- sub("_[^_]+$", "", gt$ID)
table <- gt %>%
t() %>%
as.data.frame() %>%
row_to_names(row_number = 1) # %>%
# dplyr::select(-c("48_49e_grn", "48_49f_grn", "48_49g_grn")) # remove samples: 48_49e_grn, 48_49f_grn, 48_49g_grn. they were born in a second cycle, and in the first cycle an adult male was born in F2. so i cannot tell for sure who is the father
read the meta data table and set the family
meta <- read_csv("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/meta_more.csv")
# set the families
family = grep("fnd",gt$ID, value=TRUE) %>%
str_extract("[^_]+") %>%
unique()
we expect all F2 offspring to be homozygotic (0/0) like their parents (F1)
# Prepare table with observed and expected counts, per family:
# define a list to put all the data frames in
obs <- list()
for (fam in family) {
obs[[fam]] <- table %>%
dplyr::select(starts_with(fam)) %>%
dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/0")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/0")) %>%
dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/0")) %>%
dplyr::select(contains("grn")) %>%
tidyr::pivot_longer(everything()) %>%
dplyr::rename(sample = name, gt = value) %>%
dplyr::count(sample, gt, .drop = FALSE) %>%
dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
mutate(n = as.numeric(n)) %>%
group_by(sample) %>%
# mutate(total = as.numeric(sum(n))) %>%
dplyr::rename(obs = n)
}
# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
# replace "NA" with zero value when gt doesn't exists
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1"))) # add all possible genotypes
df <- left_join(samples, observed, by=c("sample","gt")) %>%
replace(is.na(.),0) %>% # replace "NA" with zero value when gt doesn't exists
group_by(sample) %>%
mutate(total = as.numeric(sum(obs))) %>%
mutate(freq=round(obs/total, digit=2)) %>%
left_join(meta, by="sample") %>%
dplyr::rename(count = obs)
# order the stages
df$sex <- factor(df$sex, level=c("female", "male", "Nymph", "Egg"))
p = df %>% #dplyr::filter(total>10) %>%
ggplot(aes(x = sex, y = freq, colour = gt, shape = adult_sisters,
text = paste("sample:", sample,
", n:", total))) +
geom_boxplot(outlier.shape = NA, coef=0 ) +
geom_jitter(width=0.1, size=2) +
scale_color_manual(values=c("#ffbf00", "#66b032","#1982c4")) +
theme_bw() +
scale_shape_manual(values=c(1,19)) +
ggtitle('F2 offspring of F1 male (0/0) x female (0/0)') +
facet_wrap( ~gt) + guides(color = "none") +
theme(plot.margin = margin(1, 1, 1, 1, "cm"))
we expect all F2 offspring to be homozygotic (1/1) like their parents (F1)
# Prepare table with observed and expected counts, per family:
# define a list to put all the data frames in
obs <- list()
for (fam in family) {
obs[[fam]] <- table %>%
dplyr::select(starts_with(fam)) %>%
dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "1/1")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "1/1")) %>%
dplyr::filter_at(vars(matches("_dat")), all_vars(. == "1/1")) %>%
dplyr::select(contains("grn")) %>%
tidyr::pivot_longer(everything()) %>%
dplyr::rename(sample = name, gt = value) %>%
dplyr::count(sample, gt, .drop = FALSE) %>%
dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
mutate(n = as.numeric(n)) %>%
group_by(sample) %>%
# mutate(total = as.numeric(sum(n))) %>%
dplyr::rename(obs = n)
}
# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
# replace "NA" with zero value when gt doesn't exists
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1"))) # add all possible genotypes
df <- left_join(samples, observed, by=c("sample","gt")) %>%
replace(is.na(.),0) %>% # replace "NA" with zero value when gt doesn't exists
group_by(sample) %>%
mutate(total = as.numeric(sum(obs))) %>%
mutate(freq=round(obs/total, digit=2)) %>%
left_join(meta, by="sample") %>%
dplyr::rename(count = obs)
# order the stages
df$sex <- factor(df$sex, level=c("female", "male", "Nymph", "Egg"))
p = df %>% #dplyr::filter(total>10) %>%
ggplot(aes(x = sex, y = freq, colour = gt, shape = adult_sisters,
text = paste("sample:", sample,
", n:", total))) +
geom_boxplot(outlier.shape = NA, coef=0 ) +
geom_jitter(width=0.1, size=2) +
scale_color_manual(values=c("#ffbf00", "#66b032","#1982c4")) +
theme_bw() +
scale_shape_manual(values=c(1,19)) +
ggtitle('F2 offspring of F1 male (1/1) x female (1/1)') +
facet_wrap( ~gt) + guides(color = "none") +
theme(plot.margin = margin(1, 1, 1, 1, "cm"))
We expect zero sites for this cross, since there is no paternal
inheritance to the males.
Indeed, there are only 25 sites in the F2 offspring, and the genotype
distribution is random.
# Prepare table with observed and expected counts, per family:
# define a list to put all the data frames in
obs <- list()
for (fam in family) {
obs[[fam]] <- table %>%
dplyr::select(starts_with(fam)) %>%
dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/0")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "1/1")) %>%
dplyr::filter_at(vars(matches("_dat")), all_vars(. == "1/1")) %>%
dplyr::select(contains("grn")) %>%
tidyr::pivot_longer(everything()) %>%
dplyr::rename(sample = name, gt = value) %>%
dplyr::count(sample, gt, .drop = FALSE) %>%
dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
mutate(n = as.numeric(n)) %>%
group_by(sample) %>%
# mutate(total = as.numeric(sum(n))) %>%
dplyr::rename(obs = n)
}
# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
# replace "NA" with zero value when gt doesn't exists
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1"))) # add all possible genotypes
df <- left_join(samples, observed, by=c("sample","gt")) %>%
replace(is.na(.),0) %>% # replace "NA" with zero value when gt doesn't exists
group_by(sample) %>%
mutate(total = as.numeric(sum(obs))) %>%
mutate(freq=round(obs/total, digit=2)) %>%
left_join(meta, by="sample") %>%
dplyr::rename(count = obs)
# order the stages
df$sex <- factor(df$sex, level=c("female", "male", "Nymph", "Egg"))
p = df %>% #dplyr::filter(total>10) %>%
ggplot(aes(x = sex, y = freq, colour = gt, shape = adult_sisters,
text = paste("sample:", sample,
", n:", total))) +
geom_boxplot(outlier.shape = NA, coef=0 ) +
geom_jitter(width=0.1, size=2) +
scale_color_manual(values=c("#ffbf00", "#66b032","#1982c4")) +
theme_bw() +
scale_shape_manual(values=c(1,19)) +
ggtitle('F2 offspring of F1 male (0/0) x female (1/1)') +
facet_wrap( ~gt) + guides(color = "none") +
theme(plot.margin = margin(1, 1, 1, 1, "cm"))
We expect zero sites for this cross, since there is no paternal
inheritance to the males.
Indeed, there are only 24 sites in the F2 pooled females, and 15 for the
F2 pooled males:
# Prepare table with observed and expected counts, per family:
# define a list to put all the data frames in
obs <- list()
for (fam in family) {
obs[[fam]] <- table %>%
dplyr::select(starts_with(fam)) %>%
dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "1/1")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "1/1")) %>%
dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/0")) %>%
dplyr::select(contains("grn")) %>%
tidyr::pivot_longer(everything()) %>%
dplyr::rename(sample = name, gt = value) %>%
dplyr::count(sample, gt, .drop = FALSE) %>%
dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
mutate(n = as.numeric(n)) %>%
group_by(sample) %>%
# mutate(total = as.numeric(sum(n))) %>%
dplyr::rename(obs = n)
}
# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
# replace "NA" with zero value when gt doesn't exists
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1"))) # add all possible genotypes
df <- left_join(samples, observed, by=c("sample","gt")) %>%
replace(is.na(.),0) %>% # replace "NA" with zero value when gt doesn't exists
group_by(sample) %>%
mutate(total = as.numeric(sum(obs))) %>%
mutate(freq=round(obs/total, digit=2)) %>%
left_join(meta, by="sample") %>%
dplyr::rename(count = obs)
# order the stages
df$sex <- factor(df$sex, level=c("female", "male", "Nymph", "Egg"))
p = df %>% dplyr::filter(total>10) %>%
ggplot(aes(x = sex, y = freq, colour = gt, shape = adult_sisters,
text = paste("sample:", sample,
", n:", total))) +
geom_boxplot(outlier.shape = NA, coef=0 ) +
geom_jitter(width=0.1, size=2) +
scale_color_manual(values=c("#ffbf00", "#66b032","#1982c4")) +
theme_bw() +
scale_shape_manual(values=c(1,19)) +
ggtitle('F2 offspring of F1 male (1/1) x female (0/0)') +
facet_wrap( ~gt) + guides(color = "none") +
theme(plot.margin = margin(1, 1, 1, 1, "cm"))
only family 63 has sites in F2 for this cross.
something is wrong…
# Prepare table with observed and expected counts, per family:
# define a list to put all the data frames in
obs <- list()
for (fam in family) {
obs[[fam]] <- table %>%
dplyr::select(starts_with(fam)) %>%
dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/0")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/0")) %>%
dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/1")) %>%
dplyr::select(contains("grn")) %>%
tidyr::pivot_longer(everything()) %>%
dplyr::rename(sample = name, gt = value) %>%
dplyr::count(sample, gt, .drop = FALSE) %>%
dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
mutate(n = as.numeric(n)) %>%
group_by(sample) %>%
# mutate(total = as.numeric(sum(n))) %>%
dplyr::rename(obs = n)
}
# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
# replace "NA" with zero value when gt doesn't exists
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1"))) # add all possible genotypes
df <- left_join(samples, observed, by=c("sample","gt")) %>%
replace(is.na(.),0) %>% # replace "NA" with zero value when gt doesn't exists
group_by(sample) %>%
mutate(total = as.numeric(sum(obs))) %>%
mutate(freq=round(obs/total, digit=2)) %>%
left_join(meta, by="sample") %>%
dplyr::rename(count = obs)
# order the stages
df$sex <- factor(df$sex, level=c("female", "male", "Nymph", "Egg"))
p = df %>% dplyr::filter(total>10) %>%
ggplot(aes(x = sex, y = freq, colour = gt, shape = adult_sisters,
text = paste("sample:", sample,
", n:", total))) +
geom_boxplot(outlier.shape = NA, coef=0 ) +
geom_jitter(width=0.1, size=2) +
scale_color_manual(values=c("#ffbf00", "#66b032","#1982c4")) +
theme_bw() +
scale_shape_manual(values=c(1,19)) +
ggtitle('F2 offspring of F1 male (0/0) x female (0/1)') +
facet_wrap( ~gt) + guides(color = "none") +
theme(plot.margin = margin(1, 1, 1, 1, "cm"))
# Prepare table with observed and expected counts, per family:
# define a list to put all the data frames in
obs <- list()
for (fam in family) {
obs[[fam]] <- table %>%
dplyr::select(starts_with(fam)) %>%
dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "1/1")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "1/1")) %>%
dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/1")) %>%
dplyr::select(contains("grn")) %>%
tidyr::pivot_longer(everything()) %>%
dplyr::rename(sample = name, gt = value) %>%
dplyr::count(sample, gt, .drop = FALSE) %>%
dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
mutate(n = as.numeric(n)) %>%
group_by(sample) %>%
# mutate(total = as.numeric(sum(n))) %>%
dplyr::rename(obs = n)
}
# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
# replace "NA" with zero value when gt doesn't exists
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1"))) # add all possible genotypes
df <- left_join(samples, observed, by=c("sample","gt")) %>%
replace(is.na(.),0) %>% # replace "NA" with zero value when gt doesn't exists
group_by(sample) %>%
mutate(total = as.numeric(sum(obs))) %>%
mutate(freq=round(obs/total, digit=2)) %>%
left_join(meta, by="sample") %>%
dplyr::rename(count = obs)
# order the stages
df$sex <- factor(df$sex, level=c("female", "male", "Nymph", "Egg"))
p = df %>% dplyr::filter(total>10) %>%
ggplot(aes(x = sex, y = freq, colour = gt, shape = adult_sisters,
text = paste("sample:", sample,
", n:", total))) +
geom_boxplot(outlier.shape = NA, coef=0 ) +
geom_jitter(width=0.1, size=2) +
scale_color_manual(values=c("#ffbf00", "#66b032","#1982c4")) +
theme_bw() +
scale_shape_manual(values=c(1,19)) +
ggtitle('F2 offspring of F1 male (1/1) x female (0/1)') +
facet_wrap( ~gt) + guides(color = "none") +
theme(plot.margin = margin(1, 1, 1, 1, "cm"))
# Prepare table with observed and expected counts, per family:
# define a list to put all the data frames in
obs <- list()
for (fam in family) {
obs[[fam]] <- table %>%
dplyr::select(starts_with(fam)) %>%
dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/1")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/1")) %>%
dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/0")) %>%
dplyr::select(contains("grn")) %>%
tidyr::pivot_longer(everything()) %>%
dplyr::rename(sample = name, gt = value) %>%
dplyr::count(sample, gt, .drop = FALSE) %>%
dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
mutate(n = as.numeric(n)) %>%
group_by(sample) %>%
# mutate(total = as.numeric(sum(n))) %>%
dplyr::rename(obs = n)
}
# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
# replace "NA" with zero value when gt doesn't exists
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1"))) # add all possible genotypes
df <- left_join(samples, observed, by=c("sample","gt")) %>%
replace(is.na(.),0) %>% # replace "NA" with zero value when gt doesn't exists
group_by(sample) %>%
mutate(total = as.numeric(sum(obs))) %>%
mutate(freq=round(obs/total, digit=2)) %>%
left_join(meta, by="sample") %>%
dplyr::rename(count = obs)
# order the stages
df$sex <- factor(df$sex, level=c("female", "male", "Nymph", "Egg"))
p = df %>% dplyr::filter(total>10) %>%
ggplot(aes(x = sex, y = freq, colour = gt, shape = adult_sisters,
text = paste("sample:", sample,
", n:", total))) +
geom_boxplot(outlier.shape = NA, coef=0 ) +
geom_jitter(width=0.1, size=2) +
scale_color_manual(values=c("#ffbf00", "#66b032","#1982c4")) +
theme_bw() +
scale_shape_manual(values=c(1,19)) +
ggtitle('F2 offspring of F1 male (0/1) x female (0/0)') +
facet_wrap( ~gt) + guides(color = "none") +
theme(plot.margin = margin(1, 1, 1, 1, "cm"))
# Prepare table with observed and expected counts, per family:
# define a list to put all the data frames in
obs <- list()
for (fam in family) {
obs[[fam]] <- table %>%
dplyr::select(starts_with(fam)) %>%
dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/1")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/1")) %>%
dplyr::filter_at(vars(matches("_dat")), all_vars(. == "1/1")) %>%
dplyr::select(contains("grn")) %>%
tidyr::pivot_longer(everything()) %>%
dplyr::rename(sample = name, gt = value) %>%
dplyr::count(sample, gt, .drop = FALSE) %>%
dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
mutate(n = as.numeric(n)) %>%
group_by(sample) %>%
# mutate(total = as.numeric(sum(n))) %>%
dplyr::rename(obs = n)
}
# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
# replace "NA" with zero value when gt doesn't exists
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1"))) # add all possible genotypes
df <- left_join(samples, observed, by=c("sample","gt")) %>%
replace(is.na(.),0) %>% # replace "NA" with zero value when gt doesn't exists
group_by(sample) %>%
mutate(total = as.numeric(sum(obs))) %>%
mutate(freq=round(obs/total, digit=2)) %>%
left_join(meta, by="sample") %>%
dplyr::rename(count = obs)
# order the stages
df$sex <- factor(df$sex, level=c("female", "male", "Nymph", "Egg"))
p = df %>% dplyr::filter(total>10) %>%
ggplot(aes(x = sex, y = freq, colour = gt, shape = adult_sisters,
text = paste("sample:", sample,
", n:", total))) +
geom_boxplot(outlier.shape = NA, coef=0 ) +
geom_jitter(width=0.1, size=2) +
scale_color_manual(values=c("#ffbf00", "#66b032","#1982c4")) +
theme_bw() +
scale_shape_manual(values=c(1,19)) +
ggtitle('F2 offspring of F1 male (0/1) x female (1/1)') +
facet_wrap( ~gt) + guides(color = "none") +
theme(plot.margin = margin(1, 1, 1, 1, "cm"))
# Prepare table with observed and expected counts, per family:
# define a list to put all the data frames in
obs <- list()
for (fam in family) {
obs[[fam]] <- table %>%
dplyr::select(starts_with(fam)) %>%
dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/1")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/1")) %>%
dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/1")) %>%
dplyr::select(contains("grn")) %>%
tidyr::pivot_longer(everything()) %>%
dplyr::rename(sample = name, gt = value) %>%
dplyr::count(sample, gt, .drop = FALSE) %>%
dplyr::filter(gt %in% c("0/0", "1/1", "0/1")) %>%
mutate(n = as.numeric(n)) %>%
group_by(sample) %>%
# mutate(total = as.numeric(sum(n))) %>%
dplyr::rename(obs = n)
}
# bind all families together, to a final data frame containing all observed counts
observed <- do.call("rbind", obs) %>% mutate(sample = as.character(sample))
# replace "NA" with zero value when gt doesn't exists
samples <- data.frame(sample = rep(unique(observed$sample), each =3), gt = rep(c("0/0", "0/1", "1/1"))) # add all possible genotypes
df <- left_join(samples, observed, by=c("sample","gt")) %>%
replace(is.na(.),0) %>% # replace "NA" with zero value when gt doesn't exists
group_by(sample) %>%
mutate(total = as.numeric(sum(obs))) %>%
mutate(freq=round(obs/total, digit=2)) %>%
left_join(meta, by="sample") %>%
dplyr::rename(count = obs)
# order the stages
df$sex <- factor(df$sex, level=c("female", "male", "Nymph", "Egg"))
p = df %>% dplyr::filter(total>10) %>%
ggplot(aes(x = sex, y = freq, colour = gt, shape = adult_sisters,
text = paste("sample:", sample,
", n:", total))) +
geom_boxplot(outlier.shape = NA, coef=0 ) +
geom_jitter(width=0.1, size=2) +
scale_color_manual(values=c("#ffbf00", "#66b032","#1982c4")) +
theme_bw() +
scale_shape_manual(values=c(1,19)) +
ggtitle('F2 offspring of F1 male (0/1) x female (0/1)') +
facet_wrap( ~gt) + guides(color = "none") +
theme(plot.margin = margin(1, 1, 1, 1, "cm"))