library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
library(knitr)
library(kableExtra)
library(scales)
library(gridExtra)

# Professional theme
theme_report <- function() {
  theme_minimal(base_size = 12) +
    theme(
      plot.title = element_text(face = "bold", size = 14, hjust = 0, color = "#1a365d"),
      plot.subtitle = element_text(size = 11, hjust = 0, color = "#4a5568"),
      plot.caption = element_text(size = 9, color = "#718096"),
      legend.position = "bottom",
      panel.grid.minor = element_blank(),
      panel.grid.major = element_line(color = "#e2e8f0"),
      axis.title = element_text(face = "bold", size = 11),
      strip.text = element_text(face = "bold", size = 11),
      strip.background = element_rect(fill = "#f7fafc", color = NA)
    )
}
theme_set(theme_report())
setwd("C:/Users/SAlemu/OneDrive - CGIAR/Documents/Poultryaudit/Dataset/")
data <- read_excel("Dataset_evaluation_corrected.xlsx")
colnames(data) <- gsub(" ", "_", colnames(data))
data$Generation <- factor(data$Generation, levels = c("0", "1", "2"))

1 Overview

This report presents descriptive statistics for the WIL-20 poultry dataset. The data includes birds categorized into three groups labeled as generations (G0, G1, G2).

662
Total Birds
3
Generations
1,148g
Mean BW at 16wk
94.9%
Survival Rate
28
Egg Records

📋 Report Scope

This report provides descriptive statistics summarizing the phenotypic data collected across three groups (G0, G1, G2). The data represents birds hatched in different periods (2022, 2023, 2024).


2 Data Quality

2.1 Dataset Structure

data.frame(
  Variable = colnames(data),
  Type = sapply(data, function(x) class(x)[1]),
  Records = sapply(data, function(x) sum(!is.na(x))),
  Missing = sapply(data, function(x) sum(is.na(x))),
  Complete_Pct = round(sapply(data, function(x) sum(!is.na(x)) / length(x) * 100), 1)
) %>%
  kable(col.names = c("Variable", "Type", "Records", "Missing", "Complete %"),
        align = c("l", "l", "r", "r", "r")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  row_spec(0, bold = TRUE, background = "#2d3748", color = "white")
Variable Type Records Missing Complete %
ID ID character 662 0 100.0
Sire_ID Sire_ID character 398 264 60.1
Dam_ID Dam_ID character 398 264 60.1
Hatch_Date Hatch_Date character 662 0 100.0
month month numeric 662 0 100.0
year year numeric 662 0 100.0
Sex Sex character 624 38 94.3
Generation Generation factor 662 0 100.0
Breed Breed character 662 0 100.0
Batch Batch character 662 0 100.0
Cull_Date Cull_Date character 34 628 5.1
Cull_Reason Cull_Reason character 34 628 5.1
BW_0wk BW_0wk numeric 662 0 100.0
BW_8wk BW_8wk numeric 616 46 93.1
BW_12wk BW_12wk numeric 611 51 92.3
BW_16wk BW_16wk numeric 601 61 90.8
AFE_(weeks) AFE_(weeks) numeric 28 634 4.2
Egg_no_45wk Egg_no_45wk numeric 28 634 4.2
Egg_weight Egg_weight numeric 28 634 4.2

2.2 Data Completeness

completeness <- data.frame(
  Variable = colnames(data),
  Percent = sapply(data, function(x) sum(!is.na(x)) / length(x) * 100)
) %>%
  arrange(Percent) %>%
  mutate(
    Variable = factor(Variable, levels = Variable),
    Status = case_when(
      Percent >= 90 ~ "Good (≥90%)",
      Percent >= 50 ~ "Moderate (50-90%)",
      TRUE ~ "Limited (<50%)"
    )
  )

ggplot(completeness, aes(x = Variable, y = Percent, fill = Status)) +
  geom_bar(stat = "identity", width = 0.7) +
  geom_text(aes(label = paste0(round(Percent, 1), "%")), hjust = -0.1, size = 3) +
  coord_flip() +
  scale_fill_manual(values = c("Good (≥90%)" = "#38a169", "Moderate (50-90%)" = "#ed8936", "Limited (<50%)" = "#e53e3e")) +
  scale_y_continuous(limits = c(0, 115)) +
  labs(title = "Data Completeness by Variable", x = NULL, y = "Completeness (%)", fill = "Status")

📋 Data Coverage

  • Body weight traits: >90% complete
  • Egg traits: 28 records (G1), G2 recording ongoing
  • Pedigree: Complete for G1 and G2
  • Sex: 94.3% determined

3 Population Description

3.1 Distribution by Group

group_summary <- data %>%
  group_by(Generation) %>%
  summarise(
    N = n(),
    Males = sum(Sex == "M", na.rm = TRUE),
    Females = sum(Sex == "F", na.rm = TRUE),
    Unknown_Sex = sum(is.na(Sex)),
    Deaths = sum(!is.na(Cull_Reason)),
    Survival_Pct = round((1 - Deaths / N) * 100, 1),
    .groups = "drop"
  ) %>%
  mutate(Generation = paste0("G", Generation))

group_summary %>%
  kable(col.names = c("Group", "Total", "Males", "Females", "Unknown Sex", "Deaths", "Survival %"),
        align = c("l", rep("r", 6))) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  row_spec(0, bold = TRUE, background = "#2d3748", color = "white")
Group Total Males Females Unknown Sex Deaths Survival %
G0 264 120 117 27 23 91.3
G1 92 51 37 4 3 96.7
G2 306 164 135 7 8 97.4
plot_data <- data %>%
  filter(!is.na(Sex)) %>%
  group_by(Generation, Sex) %>%
  summarise(n = n(), .groups = "drop") %>%
  mutate(Generation = paste0("G", Generation))

ggplot(plot_data, aes(x = Generation, y = n, fill = Sex)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.7) +
  geom_text(aes(label = n), position = position_dodge(width = 0.7), vjust = -0.5, size = 4) +
  scale_fill_manual(values = c("F" = "#ed64a6", "M" = "#4299e1"), labels = c("Female", "Male")) +
  labs(title = "Number of Birds by Group and Sex", x = "Group", y = "Count", fill = NULL) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15)))

3.2 Pedigree Information

pedigree_summary <- data %>%
  group_by(Generation) %>%
  summarise(
    N = n(),
    Sire_Known = sum(!is.na(Sire_ID)),
    Dam_Known = sum(!is.na(Dam_ID)),
    Sire_Pct = round(Sire_Known / N * 100, 1),
    Dam_Pct = round(Dam_Known / N * 100, 1),
    .groups = "drop"
  ) %>%
  mutate(Generation = paste0("G", Generation))

pedigree_summary %>%
  kable(col.names = c("Group", "N", "Sire Known", "Dam Known", "Sire %", "Dam %"),
        align = c("l", rep("r", 5))) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#2d3748", color = "white") %>%
  row_spec(1, background = "#fff5f5")
Group N Sire Known Dam Known Sire % Dam %
G0 264 0 0 0 0
G1 92 92 92 100 100
G2 306 306 306 100 100

📋 Pedigree Status

  • G0: Foundation stock (pedigree not recorded)
  • G1 and G2: Complete pedigree information available

4 Body Weight Descriptive Statistics

4.1 Overall Summary

bw_summary <- data.frame(
  Trait = c("Hatch Weight (BW_0wk)", "8-Week Weight (BW_8wk)", "12-Week Weight (BW_12wk)", "16-Week Weight (BW_16wk)"),
  N = c(sum(!is.na(data$BW_0wk)), sum(!is.na(data$BW_8wk)), 
        sum(!is.na(data$BW_12wk)), sum(!is.na(data$BW_16wk))),
  Mean = c(mean(data$BW_0wk, na.rm=T), mean(data$BW_8wk, na.rm=T),
           mean(data$BW_12wk, na.rm=T), mean(data$BW_16wk, na.rm=T)),
  SD = c(sd(data$BW_0wk, na.rm=T), sd(data$BW_8wk, na.rm=T),
         sd(data$BW_12wk, na.rm=T), sd(data$BW_16wk, na.rm=T)),
  Min = c(min(data$BW_0wk, na.rm=T), min(data$BW_8wk, na.rm=T),
          min(data$BW_12wk, na.rm=T), min(data$BW_16wk, na.rm=T)),
  Max = c(max(data$BW_0wk, na.rm=T), max(data$BW_8wk, na.rm=T),
          max(data$BW_12wk, na.rm=T), max(data$BW_16wk, na.rm=T))
) %>%
  mutate(
    CV = round(SD / Mean * 100, 1),
    Mean = round(Mean, 1),
    SD = round(SD, 1)
  )

bw_summary %>%
  kable(col.names = c("Trait", "N", "Mean (g)", "SD", "Min", "Max", "CV %"),
        align = c("l", rep("r", 6))) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  row_spec(0, bold = TRUE, background = "#3182ce", color = "white")
Trait N Mean (g) SD Min Max CV %
Hatch Weight (BW_0wk) 662 35.7 14.5 25.0 393.0 40.5
8-Week Weight (BW_8wk) 616 451.3 115.0 113.0 984.1 25.5
12-Week Weight (BW_12wk) 611 785.3 195.9 224.3 1648.7 24.9
16-Week Weight (BW_16wk) 601 1148.4 272.0 359.2 1875.0 23.7

📋 Note

BW_0wk maximum value of 393g appears high for day-old chick weight (typical range: 25-45g). This may need verification.

4.2 Body Weight by Sex

data %>%
  filter(Sex %in% c("M", "F")) %>%
  group_by(Sex) %>%
  summarise(
    N = n(),
    BW_0wk = round(mean(BW_0wk, na.rm = TRUE), 1),
    BW_8wk = round(mean(BW_8wk, na.rm = TRUE), 1),
    BW_12wk = round(mean(BW_12wk, na.rm = TRUE), 1),
    BW_16wk = round(mean(BW_16wk, na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  mutate(Sex = ifelse(Sex == "M", "Male", "Female")) %>%
  kable(col.names = c("Sex", "N", "Hatch (g)", "8wk (g)", "12wk (g)", "16wk (g)"),
        align = c("l", rep("r", 5))) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#2d3748", color = "white")
Sex N Hatch (g) 8wk (g) 12wk (g) 16wk (g)
Female 289 35.8 414.9 702.8 995.6
Male 335 35.6 481.7 855.2 1282.2
bw_sex <- data %>%
  filter(Sex %in% c("M", "F")) %>%
  select(Sex, BW_0wk, BW_8wk, BW_12wk, BW_16wk) %>%
  pivot_longer(-Sex, names_to = "Age", values_to = "Weight") %>%
  filter(!is.na(Weight)) %>%
  mutate(
    Age_wk = as.numeric(gsub("BW_|wk", "", Age)),
    Sex = ifelse(Sex == "M", "Male", "Female")
  ) %>%
  group_by(Sex, Age_wk) %>%
  summarise(Mean = mean(Weight), SE = sd(Weight)/sqrt(n()), .groups = "drop")

ggplot(bw_sex, aes(x = Age_wk, y = Mean, color = Sex)) +
  geom_ribbon(aes(ymin = Mean - SE, ymax = Mean + SE, fill = Sex), alpha = 0.2, color = NA) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 3) +
  scale_color_manual(values = c("Female" = "#ed64a6", "Male" = "#4299e1")) +
  scale_fill_manual(values = c("Female" = "#ed64a6", "Male" = "#4299e1")) +
  scale_x_continuous(breaks = c(0, 8, 12, 16)) +
  labs(
    title = "Mean Body Weight by Sex",
    subtitle = "Shaded area represents ± 1 standard error",
    x = "Age (weeks)", y = "Body Weight (g)", color = NULL, fill = NULL
  )

4.3 Body Weight by Group

bw_by_group <- data %>%
  group_by(Generation) %>%
  summarise(
    N = n(),
    BW_0wk = round(mean(BW_0wk, na.rm = TRUE), 1),
    BW_8wk = round(mean(BW_8wk, na.rm = TRUE), 1),
    BW_12wk = round(mean(BW_12wk, na.rm = TRUE), 1),
    BW_16wk = round(mean(BW_16wk, na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  mutate(Generation = paste0("G", Generation))

bw_by_group %>%
  kable(col.names = c("Group", "N", "Hatch (g)", "8wk (g)", "12wk (g)", "16wk (g)"),
        align = c("l", rep("r", 5))) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#805ad5", color = "white")
Group N Hatch (g) 8wk (g) 12wk (g) 16wk (g)
G0 264 35.0 410.5 726.0 1085.2
G1 92 36.8 440.8 824.4 1111.9
G2 306 36.1 489.5 823.7 1213.8
bw_group_long <- data %>%
  group_by(Generation) %>%
  summarise(
    BW_0wk = mean(BW_0wk, na.rm = TRUE),
    BW_8wk = mean(BW_8wk, na.rm = TRUE),
    BW_12wk = mean(BW_12wk, na.rm = TRUE),
    BW_16wk = mean(BW_16wk, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  pivot_longer(-Generation, names_to = "Trait", values_to = "Mean") %>%
  mutate(
    Generation = paste0("G", Generation),
    Trait = factor(Trait, levels = c("BW_0wk", "BW_8wk", "BW_12wk", "BW_16wk"),
                   labels = c("Hatch", "8 weeks", "12 weeks", "16 weeks"))
  )

ggplot(bw_group_long, aes(x = Generation, y = Mean, fill = Generation)) +
  geom_bar(stat = "identity", width = 0.6) +
  geom_text(aes(label = round(Mean, 0)), vjust = -0.5, size = 3.5) +
  facet_wrap(~Trait, scales = "free_y") +
  scale_fill_manual(values = c("#3182ce", "#38a169", "#805ad5")) +
  labs(
    title = "Mean Body Weight by Generation",
    x = "Generation", y = "Mean Body Weight (g)"
  ) +
  theme(legend.position = "none") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15)))

📊 Summary: Mean Body Weight at 16 Weeks by Group

Group Mean BW_16wk Difference from G0
G0 1,085g
G1 1,112g +27g
G2 1,214g +129g

5 Egg Production (Limited Data)

📋 Note

Egg production data is currently available for 28 birds from G1. G2 egg recording is ongoing.

egg_data <- data %>% filter(!is.na(`AFE_(weeks)`))

if(nrow(egg_data) > 0) {
  data.frame(
    Trait = c("Age at First Egg (weeks)", "Egg Number at 45 weeks", "Egg Weight (g)"),
    N = c(sum(!is.na(egg_data$`AFE_(weeks)`)),
          sum(!is.na(egg_data$Egg_no_45wk)),
          sum(!is.na(egg_data$Egg_weight))),
    Mean = c(round(mean(egg_data$`AFE_(weeks)`, na.rm=T), 1),
             round(mean(egg_data$Egg_no_45wk, na.rm=T), 1),
             round(mean(egg_data$Egg_weight, na.rm=T), 1)),
    SD = c(round(sd(egg_data$`AFE_(weeks)`, na.rm=T), 2),
           round(sd(egg_data$Egg_no_45wk, na.rm=T), 1),
           round(sd(egg_data$Egg_weight, na.rm=T), 2)),
    Min = c(min(egg_data$`AFE_(weeks)`, na.rm=T),
            min(egg_data$Egg_no_45wk, na.rm=T),
            round(min(egg_data$Egg_weight, na.rm=T), 1)),
    Max = c(max(egg_data$`AFE_(weeks)`, na.rm=T),
            max(egg_data$Egg_no_45wk, na.rm=T),
            round(max(egg_data$Egg_weight, na.rm=T), 1))
  ) %>%
    kable(col.names = c("Trait", "N", "Mean", "SD", "Min", "Max"),
          align = c("l", rep("r", 5))) %>%
    kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
    row_spec(0, bold = TRUE, background = "#805ad5", color = "white")
}
Trait N Mean SD Min Max
Age at First Egg (weeks) 28 24.2 2.18 20.0 28
Egg Number at 45 weeks 28 98.8 25.40 35.0 144
Egg Weight (g) 28 52.8 3.57 47.6 61
if(nrow(egg_data) > 0) {
  p1 <- ggplot(egg_data, aes(x = `AFE_(weeks)`)) +
    geom_histogram(fill = "#805ad5", color = "white", bins = 8, alpha = 0.8) +
    labs(title = "Age at First Egg", x = "Weeks", y = "Count")
  
  p2 <- ggplot(egg_data, aes(x = Egg_no_45wk)) +
    geom_histogram(fill = "#ed64a6", color = "white", bins = 8, alpha = 0.8) +
    labs(title = "Egg Number at 45 Weeks", x = "Eggs", y = "Count")
  
  p3 <- ggplot(egg_data, aes(x = Egg_weight)) +
    geom_histogram(fill = "#ed8936", color = "white", bins = 8, alpha = 0.8) +
    labs(title = "Egg Weight", x = "Weight (g)", y = "Count")
  
  grid.arrange(p1, p2, p3, ncol = 3)
}


6 Survival Summary

survival <- data %>%
  group_by(Generation) %>%
  summarise(
    Total = n(),
    Deaths = sum(!is.na(Cull_Reason)),
    Survived = Total - Deaths,
    Survival_Pct = round(Survived / Total * 100, 1),
    .groups = "drop"
  ) %>%
  mutate(Generation = paste0("G", Generation))

survival %>%
  bind_rows(
    data.frame(Generation = "Overall", Total = sum(survival$Total), 
               Deaths = sum(survival$Deaths), Survived = sum(survival$Survived),
               Survival_Pct = round(sum(survival$Survived)/sum(survival$Total)*100, 1))
  ) %>%
  kable(col.names = c("Group", "Total", "Deaths", "Survived", "Survival %"),
        align = c("l", rep("r", 4))) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#2d3748", color = "white") %>%
  row_spec(4, bold = TRUE, background = "#f7fafc")
Group Total Deaths Survived Survival %
G0 264 23 241 91.3
G1 92 3 89 96.7
G2 306 8 298 97.4
Overall 662 34 628 94.9
ggplot(survival, aes(x = Generation, y = Survival_Pct, fill = Generation)) +
  geom_bar(stat = "identity", width = 0.5) +
  geom_text(aes(label = paste0(Survival_Pct, "%")), vjust = -0.5, size = 5, fontface = "bold") +
  scale_fill_manual(values = c("#3182ce", "#38a169", "#805ad5")) +
  scale_y_continuous(limits = c(0, 105)) +
  labs(title = "Survival Rate by Group", x = "Group", y = "Survival Rate (%)") +
  theme(legend.position = "none")


7 Summary

📋 Key Findings

Measure Value
Total birds 662
Groups 3 (G0, G1, G2)
Mean BW at 16 weeks 1,148g (overall)
Overall survival 94.9%
Egg records available 28 (G1)