library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
library(knitr)
library(kableExtra)
library(scales)
library(gridExtra)
# Professional theme
theme_report <- function() {
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", size = 14, hjust = 0, color = "#1a365d"),
plot.subtitle = element_text(size = 11, hjust = 0, color = "#4a5568"),
plot.caption = element_text(size = 9, color = "#718096"),
legend.position = "bottom",
panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "#e2e8f0"),
axis.title = element_text(face = "bold", size = 11),
strip.text = element_text(face = "bold", size = 11),
strip.background = element_rect(fill = "#f7fafc", color = NA)
)
}
theme_set(theme_report())setwd("C:/Users/SAlemu/OneDrive - CGIAR/Documents/Poultryaudit/Dataset/")
data <- read_excel("Dataset_evaluation_corrected.xlsx")
colnames(data) <- gsub(" ", "_", colnames(data))
data$Generation <- factor(data$Generation, levels = c("0", "1", "2"))This report presents descriptive statistics for the WIL-20 poultry dataset. The data includes birds categorized into three groups labeled as generations (G0, G1, G2).
This report provides descriptive statistics summarizing the phenotypic data collected across three groups (G0, G1, G2). The data represents birds hatched in different periods (2022, 2023, 2024).
data.frame(
Variable = colnames(data),
Type = sapply(data, function(x) class(x)[1]),
Records = sapply(data, function(x) sum(!is.na(x))),
Missing = sapply(data, function(x) sum(is.na(x))),
Complete_Pct = round(sapply(data, function(x) sum(!is.na(x)) / length(x) * 100), 1)
) %>%
kable(col.names = c("Variable", "Type", "Records", "Missing", "Complete %"),
align = c("l", "l", "r", "r", "r")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
row_spec(0, bold = TRUE, background = "#2d3748", color = "white")| Variable | Type | Records | Missing | Complete % | |
|---|---|---|---|---|---|
| ID | ID | character | 662 | 0 | 100.0 |
| Sire_ID | Sire_ID | character | 398 | 264 | 60.1 |
| Dam_ID | Dam_ID | character | 398 | 264 | 60.1 |
| Hatch_Date | Hatch_Date | character | 662 | 0 | 100.0 |
| month | month | numeric | 662 | 0 | 100.0 |
| year | year | numeric | 662 | 0 | 100.0 |
| Sex | Sex | character | 624 | 38 | 94.3 |
| Generation | Generation | factor | 662 | 0 | 100.0 |
| Breed | Breed | character | 662 | 0 | 100.0 |
| Batch | Batch | character | 662 | 0 | 100.0 |
| Cull_Date | Cull_Date | character | 34 | 628 | 5.1 |
| Cull_Reason | Cull_Reason | character | 34 | 628 | 5.1 |
| BW_0wk | BW_0wk | numeric | 662 | 0 | 100.0 |
| BW_8wk | BW_8wk | numeric | 616 | 46 | 93.1 |
| BW_12wk | BW_12wk | numeric | 611 | 51 | 92.3 |
| BW_16wk | BW_16wk | numeric | 601 | 61 | 90.8 |
| AFE_(weeks) | AFE_(weeks) | numeric | 28 | 634 | 4.2 |
| Egg_no_45wk | Egg_no_45wk | numeric | 28 | 634 | 4.2 |
| Egg_weight | Egg_weight | numeric | 28 | 634 | 4.2 |
completeness <- data.frame(
Variable = colnames(data),
Percent = sapply(data, function(x) sum(!is.na(x)) / length(x) * 100)
) %>%
arrange(Percent) %>%
mutate(
Variable = factor(Variable, levels = Variable),
Status = case_when(
Percent >= 90 ~ "Good (≥90%)",
Percent >= 50 ~ "Moderate (50-90%)",
TRUE ~ "Limited (<50%)"
)
)
ggplot(completeness, aes(x = Variable, y = Percent, fill = Status)) +
geom_bar(stat = "identity", width = 0.7) +
geom_text(aes(label = paste0(round(Percent, 1), "%")), hjust = -0.1, size = 3) +
coord_flip() +
scale_fill_manual(values = c("Good (≥90%)" = "#38a169", "Moderate (50-90%)" = "#ed8936", "Limited (<50%)" = "#e53e3e")) +
scale_y_continuous(limits = c(0, 115)) +
labs(title = "Data Completeness by Variable", x = NULL, y = "Completeness (%)", fill = "Status")group_summary <- data %>%
group_by(Generation) %>%
summarise(
N = n(),
Males = sum(Sex == "M", na.rm = TRUE),
Females = sum(Sex == "F", na.rm = TRUE),
Unknown_Sex = sum(is.na(Sex)),
Deaths = sum(!is.na(Cull_Reason)),
Survival_Pct = round((1 - Deaths / N) * 100, 1),
.groups = "drop"
) %>%
mutate(Generation = paste0("G", Generation))
group_summary %>%
kable(col.names = c("Group", "Total", "Males", "Females", "Unknown Sex", "Deaths", "Survival %"),
align = c("l", rep("r", 6))) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
row_spec(0, bold = TRUE, background = "#2d3748", color = "white")| Group | Total | Males | Females | Unknown Sex | Deaths | Survival % |
|---|---|---|---|---|---|---|
| G0 | 264 | 120 | 117 | 27 | 23 | 91.3 |
| G1 | 92 | 51 | 37 | 4 | 3 | 96.7 |
| G2 | 306 | 164 | 135 | 7 | 8 | 97.4 |
plot_data <- data %>%
filter(!is.na(Sex)) %>%
group_by(Generation, Sex) %>%
summarise(n = n(), .groups = "drop") %>%
mutate(Generation = paste0("G", Generation))
ggplot(plot_data, aes(x = Generation, y = n, fill = Sex)) +
geom_bar(stat = "identity", position = "dodge", width = 0.7) +
geom_text(aes(label = n), position = position_dodge(width = 0.7), vjust = -0.5, size = 4) +
scale_fill_manual(values = c("F" = "#ed64a6", "M" = "#4299e1"), labels = c("Female", "Male")) +
labs(title = "Number of Birds by Group and Sex", x = "Group", y = "Count", fill = NULL) +
scale_y_continuous(expand = expansion(mult = c(0, 0.15)))pedigree_summary <- data %>%
group_by(Generation) %>%
summarise(
N = n(),
Sire_Known = sum(!is.na(Sire_ID)),
Dam_Known = sum(!is.na(Dam_ID)),
Sire_Pct = round(Sire_Known / N * 100, 1),
Dam_Pct = round(Dam_Known / N * 100, 1),
.groups = "drop"
) %>%
mutate(Generation = paste0("G", Generation))
pedigree_summary %>%
kable(col.names = c("Group", "N", "Sire Known", "Dam Known", "Sire %", "Dam %"),
align = c("l", rep("r", 5))) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(0, bold = TRUE, background = "#2d3748", color = "white") %>%
row_spec(1, background = "#fff5f5")| Group | N | Sire Known | Dam Known | Sire % | Dam % |
|---|---|---|---|---|---|
| G0 | 264 | 0 | 0 | 0 | 0 |
| G1 | 92 | 92 | 92 | 100 | 100 |
| G2 | 306 | 306 | 306 | 100 | 100 |
bw_summary <- data.frame(
Trait = c("Hatch Weight (BW_0wk)", "8-Week Weight (BW_8wk)", "12-Week Weight (BW_12wk)", "16-Week Weight (BW_16wk)"),
N = c(sum(!is.na(data$BW_0wk)), sum(!is.na(data$BW_8wk)),
sum(!is.na(data$BW_12wk)), sum(!is.na(data$BW_16wk))),
Mean = c(mean(data$BW_0wk, na.rm=T), mean(data$BW_8wk, na.rm=T),
mean(data$BW_12wk, na.rm=T), mean(data$BW_16wk, na.rm=T)),
SD = c(sd(data$BW_0wk, na.rm=T), sd(data$BW_8wk, na.rm=T),
sd(data$BW_12wk, na.rm=T), sd(data$BW_16wk, na.rm=T)),
Min = c(min(data$BW_0wk, na.rm=T), min(data$BW_8wk, na.rm=T),
min(data$BW_12wk, na.rm=T), min(data$BW_16wk, na.rm=T)),
Max = c(max(data$BW_0wk, na.rm=T), max(data$BW_8wk, na.rm=T),
max(data$BW_12wk, na.rm=T), max(data$BW_16wk, na.rm=T))
) %>%
mutate(
CV = round(SD / Mean * 100, 1),
Mean = round(Mean, 1),
SD = round(SD, 1)
)
bw_summary %>%
kable(col.names = c("Trait", "N", "Mean (g)", "SD", "Min", "Max", "CV %"),
align = c("l", rep("r", 6))) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
row_spec(0, bold = TRUE, background = "#3182ce", color = "white")| Trait | N | Mean (g) | SD | Min | Max | CV % |
|---|---|---|---|---|---|---|
| Hatch Weight (BW_0wk) | 662 | 35.7 | 14.5 | 25.0 | 393.0 | 40.5 |
| 8-Week Weight (BW_8wk) | 616 | 451.3 | 115.0 | 113.0 | 984.1 | 25.5 |
| 12-Week Weight (BW_12wk) | 611 | 785.3 | 195.9 | 224.3 | 1648.7 | 24.9 |
| 16-Week Weight (BW_16wk) | 601 | 1148.4 | 272.0 | 359.2 | 1875.0 | 23.7 |
BW_0wk maximum value of 393g appears high for day-old chick weight (typical range: 25-45g). This may need verification.
data %>%
filter(Sex %in% c("M", "F")) %>%
group_by(Sex) %>%
summarise(
N = n(),
BW_0wk = round(mean(BW_0wk, na.rm = TRUE), 1),
BW_8wk = round(mean(BW_8wk, na.rm = TRUE), 1),
BW_12wk = round(mean(BW_12wk, na.rm = TRUE), 1),
BW_16wk = round(mean(BW_16wk, na.rm = TRUE), 1),
.groups = "drop"
) %>%
mutate(Sex = ifelse(Sex == "M", "Male", "Female")) %>%
kable(col.names = c("Sex", "N", "Hatch (g)", "8wk (g)", "12wk (g)", "16wk (g)"),
align = c("l", rep("r", 5))) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(0, bold = TRUE, background = "#2d3748", color = "white")| Sex | N | Hatch (g) | 8wk (g) | 12wk (g) | 16wk (g) |
|---|---|---|---|---|---|
| Female | 289 | 35.8 | 414.9 | 702.8 | 995.6 |
| Male | 335 | 35.6 | 481.7 | 855.2 | 1282.2 |
bw_sex <- data %>%
filter(Sex %in% c("M", "F")) %>%
select(Sex, BW_0wk, BW_8wk, BW_12wk, BW_16wk) %>%
pivot_longer(-Sex, names_to = "Age", values_to = "Weight") %>%
filter(!is.na(Weight)) %>%
mutate(
Age_wk = as.numeric(gsub("BW_|wk", "", Age)),
Sex = ifelse(Sex == "M", "Male", "Female")
) %>%
group_by(Sex, Age_wk) %>%
summarise(Mean = mean(Weight), SE = sd(Weight)/sqrt(n()), .groups = "drop")
ggplot(bw_sex, aes(x = Age_wk, y = Mean, color = Sex)) +
geom_ribbon(aes(ymin = Mean - SE, ymax = Mean + SE, fill = Sex), alpha = 0.2, color = NA) +
geom_line(linewidth = 1.2) +
geom_point(size = 3) +
scale_color_manual(values = c("Female" = "#ed64a6", "Male" = "#4299e1")) +
scale_fill_manual(values = c("Female" = "#ed64a6", "Male" = "#4299e1")) +
scale_x_continuous(breaks = c(0, 8, 12, 16)) +
labs(
title = "Mean Body Weight by Sex",
subtitle = "Shaded area represents ± 1 standard error",
x = "Age (weeks)", y = "Body Weight (g)", color = NULL, fill = NULL
)bw_by_group <- data %>%
group_by(Generation) %>%
summarise(
N = n(),
BW_0wk = round(mean(BW_0wk, na.rm = TRUE), 1),
BW_8wk = round(mean(BW_8wk, na.rm = TRUE), 1),
BW_12wk = round(mean(BW_12wk, na.rm = TRUE), 1),
BW_16wk = round(mean(BW_16wk, na.rm = TRUE), 1),
.groups = "drop"
) %>%
mutate(Generation = paste0("G", Generation))
bw_by_group %>%
kable(col.names = c("Group", "N", "Hatch (g)", "8wk (g)", "12wk (g)", "16wk (g)"),
align = c("l", rep("r", 5))) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(0, bold = TRUE, background = "#805ad5", color = "white")| Group | N | Hatch (g) | 8wk (g) | 12wk (g) | 16wk (g) |
|---|---|---|---|---|---|
| G0 | 264 | 35.0 | 410.5 | 726.0 | 1085.2 |
| G1 | 92 | 36.8 | 440.8 | 824.4 | 1111.9 |
| G2 | 306 | 36.1 | 489.5 | 823.7 | 1213.8 |
bw_group_long <- data %>%
group_by(Generation) %>%
summarise(
BW_0wk = mean(BW_0wk, na.rm = TRUE),
BW_8wk = mean(BW_8wk, na.rm = TRUE),
BW_12wk = mean(BW_12wk, na.rm = TRUE),
BW_16wk = mean(BW_16wk, na.rm = TRUE),
.groups = "drop"
) %>%
pivot_longer(-Generation, names_to = "Trait", values_to = "Mean") %>%
mutate(
Generation = paste0("G", Generation),
Trait = factor(Trait, levels = c("BW_0wk", "BW_8wk", "BW_12wk", "BW_16wk"),
labels = c("Hatch", "8 weeks", "12 weeks", "16 weeks"))
)
ggplot(bw_group_long, aes(x = Generation, y = Mean, fill = Generation)) +
geom_bar(stat = "identity", width = 0.6) +
geom_text(aes(label = round(Mean, 0)), vjust = -0.5, size = 3.5) +
facet_wrap(~Trait, scales = "free_y") +
scale_fill_manual(values = c("#3182ce", "#38a169", "#805ad5")) +
labs(
title = "Mean Body Weight by Generation",
x = "Generation", y = "Mean Body Weight (g)"
) +
theme(legend.position = "none") +
scale_y_continuous(expand = expansion(mult = c(0, 0.15)))| Group | Mean BW_16wk | Difference from G0 |
|---|---|---|
| G0 | 1,085g | — |
| G1 | 1,112g | +27g |
| G2 | 1,214g | +129g |
Egg production data is currently available for 28 birds from G1. G2 egg recording is ongoing.
egg_data <- data %>% filter(!is.na(`AFE_(weeks)`))
if(nrow(egg_data) > 0) {
data.frame(
Trait = c("Age at First Egg (weeks)", "Egg Number at 45 weeks", "Egg Weight (g)"),
N = c(sum(!is.na(egg_data$`AFE_(weeks)`)),
sum(!is.na(egg_data$Egg_no_45wk)),
sum(!is.na(egg_data$Egg_weight))),
Mean = c(round(mean(egg_data$`AFE_(weeks)`, na.rm=T), 1),
round(mean(egg_data$Egg_no_45wk, na.rm=T), 1),
round(mean(egg_data$Egg_weight, na.rm=T), 1)),
SD = c(round(sd(egg_data$`AFE_(weeks)`, na.rm=T), 2),
round(sd(egg_data$Egg_no_45wk, na.rm=T), 1),
round(sd(egg_data$Egg_weight, na.rm=T), 2)),
Min = c(min(egg_data$`AFE_(weeks)`, na.rm=T),
min(egg_data$Egg_no_45wk, na.rm=T),
round(min(egg_data$Egg_weight, na.rm=T), 1)),
Max = c(max(egg_data$`AFE_(weeks)`, na.rm=T),
max(egg_data$Egg_no_45wk, na.rm=T),
round(max(egg_data$Egg_weight, na.rm=T), 1))
) %>%
kable(col.names = c("Trait", "N", "Mean", "SD", "Min", "Max"),
align = c("l", rep("r", 5))) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(0, bold = TRUE, background = "#805ad5", color = "white")
}| Trait | N | Mean | SD | Min | Max |
|---|---|---|---|---|---|
| Age at First Egg (weeks) | 28 | 24.2 | 2.18 | 20.0 | 28 |
| Egg Number at 45 weeks | 28 | 98.8 | 25.40 | 35.0 | 144 |
| Egg Weight (g) | 28 | 52.8 | 3.57 | 47.6 | 61 |
if(nrow(egg_data) > 0) {
p1 <- ggplot(egg_data, aes(x = `AFE_(weeks)`)) +
geom_histogram(fill = "#805ad5", color = "white", bins = 8, alpha = 0.8) +
labs(title = "Age at First Egg", x = "Weeks", y = "Count")
p2 <- ggplot(egg_data, aes(x = Egg_no_45wk)) +
geom_histogram(fill = "#ed64a6", color = "white", bins = 8, alpha = 0.8) +
labs(title = "Egg Number at 45 Weeks", x = "Eggs", y = "Count")
p3 <- ggplot(egg_data, aes(x = Egg_weight)) +
geom_histogram(fill = "#ed8936", color = "white", bins = 8, alpha = 0.8) +
labs(title = "Egg Weight", x = "Weight (g)", y = "Count")
grid.arrange(p1, p2, p3, ncol = 3)
}survival <- data %>%
group_by(Generation) %>%
summarise(
Total = n(),
Deaths = sum(!is.na(Cull_Reason)),
Survived = Total - Deaths,
Survival_Pct = round(Survived / Total * 100, 1),
.groups = "drop"
) %>%
mutate(Generation = paste0("G", Generation))
survival %>%
bind_rows(
data.frame(Generation = "Overall", Total = sum(survival$Total),
Deaths = sum(survival$Deaths), Survived = sum(survival$Survived),
Survival_Pct = round(sum(survival$Survived)/sum(survival$Total)*100, 1))
) %>%
kable(col.names = c("Group", "Total", "Deaths", "Survived", "Survival %"),
align = c("l", rep("r", 4))) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(0, bold = TRUE, background = "#2d3748", color = "white") %>%
row_spec(4, bold = TRUE, background = "#f7fafc")| Group | Total | Deaths | Survived | Survival % |
|---|---|---|---|---|
| G0 | 264 | 23 | 241 | 91.3 |
| G1 | 92 | 3 | 89 | 96.7 |
| G2 | 306 | 8 | 298 | 97.4 |
| Overall | 662 | 34 | 628 | 94.9 |
ggplot(survival, aes(x = Generation, y = Survival_Pct, fill = Generation)) +
geom_bar(stat = "identity", width = 0.5) +
geom_text(aes(label = paste0(Survival_Pct, "%")), vjust = -0.5, size = 5, fontface = "bold") +
scale_fill_manual(values = c("#3182ce", "#38a169", "#805ad5")) +
scale_y_continuous(limits = c(0, 105)) +
labs(title = "Survival Rate by Group", x = "Group", y = "Survival Rate (%)") +
theme(legend.position = "none")| Measure | Value |
|---|---|
| Total birds | 662 |
| Groups | 3 (G0, G1, G2) |
| Mean BW at 16 weeks | 1,148g (overall) |
| Overall survival | 94.9% |
| Egg records available | 28 (G1) |
Complete G2 egg production recording — Currently ongoing, will provide full reproductive dataset
Data verification — Check BW_0wk outlier (393g may be a typo for 39.3g)
Pedigree documentation — Consolidating pedigree records for all groups would enable more advanced analyses in the future