library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
library(knitr)
library(kableExtra)

# ── Custom ggplot2 theme matching the report palette (colorblind-friendly) ──
theme_report <- theme_minimal(base_size = 13) +
  theme(
    plot.title       = element_text(face = "bold", size = 15, color = "#2166AC"),
    plot.subtitle    = element_text(color = "#6c757d", size = 11),
    plot.background  = element_rect(fill = "white", color = NA),
    panel.background = element_rect(fill = "white", color = NA),
    panel.grid.major = element_line(color = "#f0f0f0", linewidth = 0.4),
    panel.grid.minor = element_blank(),
    axis.title       = element_text(size = 11, color = "#495057", face = "bold"),
    axis.text        = element_text(size = 10, color = "#6c757d"),
    legend.position  = "top",
    legend.title     = element_text(face = "bold", size = 10),
    legend.text      = element_text(size = 10),
    strip.text       = element_text(face = "bold", size = 11, color = "#2166AC"),
    strip.background = element_rect(fill = "#D1E5F0", color = NA)
  )
theme_set(theme_report)

# ── Colorblind-friendly palettes (RdBu / Blue-Orange diverging) ──
pal_primary <- c("#2166AC", "#4393C3", "#67A9CF", "#92C5DE", "#D1E5F0")
pal_accent  <- c("#D6604D", "#F4A582", "#FDDBC7", "#4393C3", "#2166AC")
pal_sex     <- c("F" = "#4393C3", "M" = "#D6604D")
working_dir <- "C:/Users/SAlemu/OneDrive - CGIAR/Documents/Nigeria_poultry"
setwd(working_dir)
file_path <- "AMO NIGERIA DATA.xlsx"
dat <- read_excel(file_path, sheet = 1)

dat <- dat %>%
  select(-any_of(c("INDEX", "REMARK", "FILTER_NUM", "SPOT_NUMB", 
                    "TAGNO", "SAMPLE_UNI"))) %>%
  mutate(
    SIRENO  = as.character(SIRENO),
    DAMNO   = as.character(DAMNO),
    GSIRENO = as.character(GSIRENO),
    WINGNO  = as.character(WINGNO),
    SEX     = as.factor(SEX),
    GCODE   = as.factor(GCODE)
  )

1 Executive Summary

This report provides a data-driven assessment of the AMO Nigeria poultry farm dataset for genetic evaluation and breeding strategy development.

n_animals <- nrow(dat)
n_females <- sum(dat$SEX == "F", na.rm = TRUE)
n_males   <- sum(dat$SEX == "M", na.rm = TRUE)
n_sires   <- length(unique(na.omit(dat$SIRENO)))
n_dams    <- length(unique(na.omit(dat$DAMNO)))
n_gsires  <- length(unique(na.omit(dat$GSIRENO)))

sires_in_data <- sum(unique(na.omit(dat$SIRENO)) %in% dat$WINGNO)
dams_in_data  <- sum(unique(na.omit(dat$DAMNO)) %in% dat$WINGNO)

# Generation assignment
gen1_sires <- dat$WINGNO[dat$WINGNO %in% dat$SIRENO]
gen1_dams  <- dat$WINGNO[dat$WINGNO %in% dat$DAMNO]
dat$generation <- ifelse(dat$WINGNO %in% c(gen1_sires, gen1_dams), "Gen1_Parent", "Gen2_Offspring")
n_gen1 <- sum(dat$generation == "Gen1_Parent")
n_gen2 <- sum(dat$generation == "Gen2_Offspring")
Parameter Value
Total birds 520
Females / Males 500 / 20
Unique sires 31
Unique dams 173
Unique grand-sires 14
Sires with own phenotype 4
Dams with own phenotype 20
Generation 1 (parents) 24
Generation 2 (offspring) 496
Generations spanned ≥ 2 (overlapping)

2 Data Overview & Structure

The dataset contains 520 records with 20 columns spanning identification, pedigree, and phenotypic variables.

2.1 Missing Values

missing_summary <- dat %>%
  summarise(across(everything(), ~sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "Variable", values_to = "N_Missing") %>%
  mutate(Pct_Missing = round(N_Missing / nrow(dat) * 100, 2)) %>%
  filter(N_Missing > 0) %>%
  arrange(desc(N_Missing))

if (nrow(missing_summary) > 0) {
  kable(missing_summary, caption = "Variables with Missing Values") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
} else {
  cat("No missing values detected.")
}
Variables with Missing Values
Variable N_Missing Pct_Missing
CAGENO 23 4.42
EW1 23 4.42
EW2 23 4.42
EW3 23 4.42
EWT 23 4.42
BW1 23 4.42
BW2 23 4.42
BW3 23 4.42
EP 23 4.42
LP 23 4.42
AFEGG 23 4.42
PP 23 4.42
GSIRENO 3 0.58
SIRENO 3 0.58
DAMNO 3 0.58

2.2 Sex Distribution

sex_tab <- dat %>%
  count(SEX, name = "Count") %>%
  mutate(Percent = round(Count / sum(Count) * 100, 2))

kable(sex_tab, caption = "Sex Distribution") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Sex Distribution
SEX Count Percent
F 500 96.15
M 20 3.85

2.3 Genetic Code Distribution

gcode_tab <- dat %>%
  count(GCODE, name = "Count") %>%
  mutate(Percent = round(Count / sum(Count) * 100, 2)) %>%
  arrange(desc(Count))

kable(gcode_tab, caption = "Genetic Code (GCODE) Distribution") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Genetic Code (GCODE) Distribution
GCODE Count Percent
K10 520 100

3 Descriptive Statistics — Production & Growth Traits

trait_cols <- c("EW1", "EW2", "EW3", "EWT", "BW1", "BW2", "BW3",
                "EP", "LP", "TE", "AFEGG", "PP")
trait_cols <- trait_cols[trait_cols %in% names(dat)]

desc_stats <- dat %>%
  select(all_of(trait_cols)) %>%
  pivot_longer(everything(), names_to = "Trait", values_to = "Value") %>%
  filter(!is.na(Value)) %>%
  group_by(Trait) %>%
  summarise(
    N      = n(),
    Mean   = round(mean(Value), 2),
    SD     = round(sd(Value), 2),
    `CV%`  = round(sd(Value) / mean(Value) * 100, 2),
    Min    = round(min(Value), 2),
    Q1     = round(quantile(Value, 0.25), 2),
    Median = round(median(Value), 2),
    Q3     = round(quantile(Value, 0.75), 2),
    Max    = round(max(Value), 2),
    .groups = "drop"
  ) %>%
  arrange(Trait)

kable(desc_stats, caption = "Descriptive Statistics for All Traits") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = TRUE) %>%
  row_spec(0, bold = TRUE)
Descriptive Statistics for All Traits
Trait N Mean SD CV% Min Q1 Median Q3 Max
AFEGG 497 149.27 10.09 6.76 125.0 143.00 149.0 156.00 200.00
BW1 497 1756.04 165.43 9.42 1193.0 1651.00 1763.0 1863.00 2168.00
BW2 497 2462.32 260.55 10.58 1108.0 2293.00 2458.0 2638.00 3398.00
BW3 497 2699.64 294.86 10.92 1385.0 2533.00 2703.0 2868.00 3508.00
EP 497 107.16 28.20 26.32 -3.0 95.00 115.0 127.00 153.00
EW1 497 43.51 2.87 6.59 32.2 41.80 43.6 45.40 56.90
EW2 497 55.00 4.29 7.80 35.0 52.50 55.0 57.80 69.00
EW3 497 60.17 6.48 10.76 32.0 57.00 60.0 63.10 108.00
EWT 497 52.80 3.40 6.44 39.9 50.80 52.9 54.90 68.80
LP 497 53.06 36.72 69.20 -3.0 14.00 66.0 86.00 108.00
PP 497 61.25 22.13 36.14 0.7 44.60 69.2 78.91 93.14
TE 520 153.14 66.19 43.22 -5.0 105.75 178.5 205.00 252.00

3.1 Egg Production Trait Distributions

egg_cols <- c("EP", "LP", "TE", "AFEGG")
egg_cols <- egg_cols[egg_cols %in% names(dat)]

# Full trait name labels
egg_labels <- c(
  "EP"    = "Early Egg Production",
  "LP"    = "Late Egg Production",
  "TE"    = "Total Egg Number",
  "AFEGG" = "Age at First Egg (days)"
)

dat %>%
  select(all_of(egg_cols)) %>%
  pivot_longer(everything(), names_to = "Trait", values_to = "Value") %>%
  filter(!is.na(Value)) %>%
  mutate(Trait = factor(Trait, levels = names(egg_labels), labels = egg_labels)) %>%
  ggplot(aes(x = Value)) +
  geom_histogram(aes(y = after_stat(density)), fill = "#4393C3", color = "white",
                 alpha = 0.85, bins = 30) +
  geom_density(color = "#D6604D", linewidth = 1, alpha = 0.7) +
  facet_wrap(~Trait, scales = "free") +
  labs(title = "Distribution of Egg Production Traits",
       subtitle = "AMO Nigeria Poultry Data — Histogram with density overlay",
       x = "Value", y = "Density")
Distribution of egg production traits.

Distribution of egg production traits.

Key observations:

  • Egg production traits exhibit high CV (26.32% for EP, 43.22% for TE) — excellent variation for selection.
  • Age at first egg averages 149.27 days (~21.3 weeks).
  • Body weights show moderate CV (9–11%), with progressive growth from BW1 to BW3.

4 Pedigree & Family Structure

4.1 Offspring per Sire

offspring_per_sire <- dat %>%
  filter(!is.na(SIRENO)) %>%
  count(SIRENO, name = "N_Offspring") %>%
  arrange(desc(N_Offspring))

sire_summary <- offspring_per_sire %>%
  summarise(
    N_Sires        = n(),
    Mean_Offspring  = round(mean(N_Offspring), 2),
    SD_Offspring    = round(sd(N_Offspring), 2),
    Min             = min(N_Offspring),
    Median          = median(N_Offspring),
    Max             = max(N_Offspring),
    Total_Offspring = sum(N_Offspring)
  )

kable(sire_summary, caption = "Summary: Offspring per Sire") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Summary: Offspring per Sire
N_Sires Mean_Offspring SD_Offspring Min Median Max Total_Offspring
31 16.68 12.63 1 17 40 517
ggplot(offspring_per_sire, aes(x = N_Offspring)) +
  geom_histogram(binwidth = 1, fill = "#4393C3", color = "white", alpha = 0.85) +
  geom_vline(aes(xintercept = mean(N_Offspring)), color = "#D6604D",
             linewidth = 1.2, linetype = "dashed") +
  annotate("text", x = mean(offspring_per_sire$N_Offspring) + 0.5,
           y = Inf, vjust = 2, hjust = -0.1,
           label = paste("Mean =", round(mean(offspring_per_sire$N_Offspring), 1)),
           color = "#D6604D", fontface = "bold", size = 4) +
  labs(title = "Distribution of Offspring per Sire",
       subtitle = paste("N sires =", nrow(offspring_per_sire)),
       x = "Number of Offspring", y = "Number of Sires")
Distribution of offspring per sire.

Distribution of offspring per sire.

kable(head(offspring_per_sire, 10), caption = "Top 10 Sires by Offspring Count") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Top 10 Sires by Offspring Count
SIRENO N_Offspring
3659 40
2538 37
4043 35
4776 35
1661 31
1211 27
1662 27
1197 25
3620 25
2853 24

4.2 Offspring per Dam

offspring_per_dam <- dat %>%
  filter(!is.na(DAMNO)) %>%
  count(DAMNO, name = "N_Offspring") %>%
  arrange(desc(N_Offspring))

dam_summary <- offspring_per_dam %>%
  summarise(
    N_Dams         = n(),
    Mean_Offspring  = round(mean(N_Offspring), 2),
    SD_Offspring    = round(sd(N_Offspring), 2),
    Min             = min(N_Offspring),
    Median          = median(N_Offspring),
    Max             = max(N_Offspring),
    Total_Offspring = sum(N_Offspring)
  )

kable(dam_summary, caption = "Summary: Offspring per Dam") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Summary: Offspring per Dam
N_Dams Mean_Offspring SD_Offspring Min Median Max Total_Offspring
173 2.99 1.79 1 3 8 517
ggplot(offspring_per_dam, aes(x = N_Offspring)) +
  geom_histogram(binwidth = 1, fill = "#D6604D", color = "white", alpha = 0.85) +
  geom_vline(aes(xintercept = mean(N_Offspring)), color = "#2166AC",
             linewidth = 1.2, linetype = "dashed") +
  annotate("text", x = mean(offspring_per_dam$N_Offspring) + 0.3,
           y = Inf, vjust = 2, hjust = -0.1,
           label = paste("Mean =", round(mean(offspring_per_dam$N_Offspring), 1)),
           color = "#2166AC", fontface = "bold", size = 4) +
  labs(title = "Distribution of Offspring per Dam",
       subtitle = paste("N dams =", nrow(offspring_per_dam)),
       x = "Number of Offspring", y = "Number of Dams")
Distribution of offspring per dam.

Distribution of offspring per dam.

4.3 Sire × Dam Mating Combinations

sire_dam_combo <- dat %>%
  filter(!is.na(SIRENO) & !is.na(DAMNO)) %>%
  count(SIRENO, DAMNO, name = "N_Offspring") %>%
  arrange(desc(N_Offspring))

sire_dam_summary <- sire_dam_combo %>%
  summarise(
    N_Matings       = n(),
    Mean_Offspring  = round(mean(N_Offspring), 2),
    SD_Offspring    = round(sd(N_Offspring), 2),
    Min             = min(N_Offspring),
    Median          = median(N_Offspring),
    Max             = max(N_Offspring),
    Total_Offspring = sum(N_Offspring)
  )

kable(sire_dam_summary, caption = "Summary: Offspring per Sire-Dam Mating") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Summary: Offspring per Sire-Dam Mating
N_Matings Mean_Offspring SD_Offspring Min Median Max Total_Offspring
173 2.99 1.79 1 3 8 517

4.4 Full-Sib & Half-Sib Relationships

# Full-sibs
full_sib_families <- sire_dam_combo %>% filter(N_Offspring >= 2)
n_full_sib_families <- nrow(full_sib_families)
n_full_sib_pairs    <- sum(choose(full_sib_families$N_Offspring, 2))

# Full-sib family size distribution
fs_size_dist <- full_sib_families %>%
  count(N_Offspring, name = "N_Families") %>%
  arrange(N_Offspring)

# Paternal half-sibs
paternal_hs <- dat %>%
  filter(!is.na(SIRENO) & !is.na(DAMNO)) %>%
  group_by(SIRENO) %>%
  summarise(
    N_Total     = n(),
    N_Dams      = n_distinct(DAMNO),
    Total_Pairs = choose(N_Total, 2),
    .groups = "drop"
  )

fs_per_sire <- dat %>%
  filter(!is.na(SIRENO) & !is.na(DAMNO)) %>%
  count(SIRENO, DAMNO, name = "n") %>%
  group_by(SIRENO) %>%
  summarise(FS_Pairs = sum(choose(n, 2)), .groups = "drop")

paternal_hs <- paternal_hs %>%
  left_join(fs_per_sire, by = "SIRENO") %>%
  mutate(HS_Pairs = Total_Pairs - FS_Pairs)

n_paternal_hs_pairs <- sum(paternal_hs$HS_Pairs)
n_sires_multi_dam   <- sum(paternal_hs$N_Dams > 1)

# Maternal half-sibs
maternal_hs <- dat %>%
  filter(!is.na(SIRENO) & !is.na(DAMNO)) %>%
  group_by(DAMNO) %>%
  summarise(
    N_Total     = n(),
    N_Sires     = n_distinct(SIRENO),
    Total_Pairs = choose(N_Total, 2),
    .groups = "drop"
  )

fs_per_dam <- dat %>%
  filter(!is.na(SIRENO) & !is.na(DAMNO)) %>%
  count(SIRENO, DAMNO, name = "n") %>%
  group_by(DAMNO) %>%
  summarise(FS_Pairs = sum(choose(n, 2)), .groups = "drop")

maternal_hs <- maternal_hs %>%
  left_join(fs_per_dam, by = "DAMNO") %>%
  mutate(HS_Pairs = Total_Pairs - FS_Pairs)

n_maternal_hs_pairs <- sum(maternal_hs$HS_Pairs)
n_dams_multi_sire   <- sum(maternal_hs$N_Sires > 1)

4.4.1 Relationship Summary

rel_summary <- data.frame(
  Relationship = c("Full-sib families (sire-dam combos ≥ 2 offspring)",
                    "Full-sib pairs",
                    "Paternal half-sib pairs (same sire, different dam)",
                    "Maternal half-sib pairs (same dam, different sire)",
                    "Total half-sib pairs",
                    "Sires mated to multiple dams",
                    "Dams mated to multiple sires"),
  Count = c(n_full_sib_families,
            n_full_sib_pairs,
            n_paternal_hs_pairs,
            n_maternal_hs_pairs,
            n_paternal_hs_pairs + n_maternal_hs_pairs,
            n_sires_multi_dam,
            n_dams_multi_sire)
)

kable(rel_summary, caption = "Pedigree Relationship Summary",
      col.names = c("Relationship", "N")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Pedigree Relationship Summary
Relationship N
Full-sib families (sire-dam combos ≥ 2 offspring) 124
Full-sib pairs 790
Paternal half-sib pairs (same sire, different dam) 5656
Maternal half-sib pairs (same dam, different sire) 0
Total half-sib pairs 5656
Sires mated to multiple dams 23
Dams mated to multiple sires 0

4.4.2 Full-Sib Family Size Distribution

if (nrow(fs_size_dist) > 0) {
  ggplot(fs_size_dist, aes(x = factor(N_Offspring), y = N_Families)) +
    geom_col(fill = "#4393C3", alpha = 0.85, width = 0.7) +
    geom_text(aes(label = N_Families), vjust = -0.5, color = "#2166AC",
              fontface = "bold", size = 3.5) +
    labs(title = "Full-Sib Family Size Distribution",
         subtitle = "Number of families by offspring count per sire-dam combination",
         x = "Family Size (offspring per sire-dam combo)",
         y = "Number of Families")
}
Full-sib family size distribution.

Full-sib family size distribution.

The hierarchical mating design (sires mated to multiple dams) generates both full-sib and paternal half-sib families — essential for partitioning additive genetic variance (between-sire component) and common environmental/dominance effects (within-sire, between-dam component).


6 Body Weight Analysis

bw_cols <- c("BW1", "BW2", "BW3")
bw_cols <- bw_cols[bw_cols %in% names(dat)]

bw_by_sex <- dat %>%
  filter(!is.na(SEX)) %>%
  select(SEX, all_of(bw_cols)) %>%
  pivot_longer(cols = all_of(bw_cols), names_to = "Period", values_to = "BodyWeight") %>%
  filter(!is.na(BodyWeight)) %>%
  group_by(SEX, Period) %>%
  summarise(
    N      = n(),
    Mean   = round(mean(BodyWeight), 1),
    SD     = round(sd(BodyWeight), 1),
    `CV%`  = round(sd(BodyWeight) / mean(BodyWeight) * 100, 2),
    Min    = round(min(BodyWeight), 0),
    Median = round(median(BodyWeight), 0),
    Max    = round(max(BodyWeight), 0),
    .groups = "drop"
  ) %>%
  arrange(Period, SEX)

kable(bw_by_sex, caption = "Body Weight Summary by Sex and Period") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = TRUE)
Body Weight Summary by Sex and Period
SEX Period N Mean SD CV% Min Median Max
F BW1 497 1756.0 165.4 9.42 1193 1763 2168
F BW2 497 2462.3 260.5 10.58 1108 2458 3398
F BW3 497 2699.6 294.9 10.92 1385 2703 3508
dat %>%
  filter(!is.na(SEX)) %>%
  select(SEX, all_of(bw_cols)) %>%
  pivot_longer(cols = all_of(bw_cols), names_to = "Period", values_to = "BodyWeight") %>%
  filter(!is.na(BodyWeight)) %>%
  ggplot(aes(x = Period, y = BodyWeight, fill = SEX)) +
  geom_boxplot(alpha = 0.8, outlier.shape = 21, outlier.size = 2, width = 0.6) +
  stat_summary(fun = mean, geom = "point", shape = 18, size = 3,
               color = "#D6604D", position = position_dodge(width = 0.6)) +
  labs(title = "Body Weight by Sex and Period",
       subtitle = "AMO Nigeria Poultry Data — diamonds show group means",
       x = "Body Weight Period", y = "Body Weight (g)", fill = "Sex") +
  scale_fill_manual(values = pal_sex)
Body weight distribution across measurement periods by sex.

Body weight distribution across measurement periods by sex.

⚠️ Note: Body weight records are available only for females (n=497). The 20 males lack BW data. Sire EBVs for BW can still be estimated from daughter performance.


7 Phenotypic Correlations

cor_data <- dat %>% select(all_of(trait_cols)) %>% drop_na()
cor_mat  <- round(cor(cor_data), 3)

kable(cor_mat, caption = paste0("Phenotypic Correlations (n = ", nrow(cor_data), " complete cases)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE, font_size = 11)
Phenotypic Correlations (n = 497 complete cases)
EW1 EW2 EW3 EWT BW1 BW2 BW3 EP LP TE AFEGG PP
EW1 1.000 0.310 0.116 0.487 -0.076 0.092 0.013 -0.103 0.053 -0.017 0.271 0.010
EW2 0.310 1.000 0.426 0.770 0.001 0.081 0.014 -0.064 0.029 -0.013 0.192 0.010
EW3 0.116 0.426 1.000 0.846 0.049 0.051 0.011 -0.029 0.046 0.014 0.067 0.023
EWT 0.487 0.770 0.846 1.000 0.012 0.090 0.019 -0.076 0.055 -0.002 0.196 0.018
BW1 -0.076 0.001 0.049 0.012 1.000 0.491 0.332 0.128 0.040 0.086 -0.400 0.042
BW2 0.092 0.081 0.051 0.090 0.491 1.000 0.609 -0.069 0.030 -0.014 0.008 -0.016
BW3 0.013 0.014 0.011 0.019 0.332 0.609 1.000 -0.094 -0.122 -0.122 -0.033 -0.128
EP -0.103 -0.064 -0.029 -0.076 0.128 -0.069 -0.094 1.000 0.629 0.874 -0.353 0.849
LP 0.053 0.029 0.046 0.055 0.040 0.030 -0.122 0.629 1.000 0.928 0.000 0.936
TE -0.017 -0.013 0.014 -0.002 0.086 -0.014 -0.122 0.874 0.928 1.000 -0.169 0.993
AFEGG 0.271 0.192 0.067 0.196 -0.400 0.008 -0.033 -0.353 0.000 -0.169 1.000 -0.070
PP 0.010 0.010 0.023 0.018 0.042 -0.016 -0.128 0.849 0.936 0.993 -0.070 1.000
# Heatmap visualization
cor_long <- as.data.frame(as.table(cor_mat)) %>%
  rename(Trait1 = Var1, Trait2 = Var2, r = Freq)

ggplot(cor_long, aes(x = Trait1, y = Trait2, fill = r)) +
  geom_tile(color = "white", linewidth = 0.5) +
  geom_text(aes(label = sprintf("%.2f", r)),
            size = 3, color = ifelse(abs(cor_long$r) > 0.5, "white", "#333333")) +
  scale_fill_gradient2(low = "#D6604D", mid = "white", high = "#2166AC",
                       midpoint = 0, limits = c(-1, 1),
                       name = "Correlation") +
  labs(title = "Phenotypic Correlation Matrix",
       subtitle = "AMO Nigeria Poultry — All measured traits") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title = element_blank(),
        panel.grid = element_blank())
Phenotypic correlation heatmap for all traits.

Phenotypic correlation heatmap for all traits.

Critical correlations for breeding:

key_corr <- data.frame(
  `Trait Pair` = c("EP ↔ TE", "LP ↔ TE", "BW1 ↔ AFEGG",
                    "BW2 ↔ BW3", "BW1 ↔ BW2", "EW1 ↔ EP", "BW3 ↔ TE"),
  r = c(cor_mat["EP","TE"], cor_mat["LP","TE"],
        cor_mat["BW1","AFEGG"], cor_mat["BW2","BW3"], cor_mat["BW1","BW2"],
        cor_mat["EW1","EP"], cor_mat["BW3","TE"]),
  Implication = c(
    "Early production strongly predicts total production — enables early selection",
    "Late production dominates total — persistency matters",
    "Heavier birds mature earlier — biologically meaningful",
    "Strong repeatability across body weight periods",
    "Moderate repeatability — early BW predicts later BW",
    "Mild antagonism between egg size and egg number",
    "Weak negative — classic layer BW vs production trade-off"
  ),
  check.names = FALSE
)

kable(key_corr, caption = "Key Correlations for Breeding Strategy") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = TRUE)
Key Correlations for Breeding Strategy
Trait Pair r Implication
EP ↔︎ TE 0.874 Early production strongly predicts total production — enables early selection
LP ↔︎ TE 0.928 Late production dominates total — persistency matters
BW1 ↔︎ AFEGG -0.400 Heavier birds mature earlier — biologically meaningful
BW2 ↔︎ BW3 0.609 Strong repeatability across body weight periods
BW1 ↔︎ BW2 0.491 Moderate repeatability — early BW predicts later BW
EW1 ↔︎ EP -0.103 Mild antagonism between egg size and egg number
BW3 ↔︎ TE -0.122 Weak negative — classic layer BW vs production trade-off

8 Readiness for Genetic Evaluation

Requirement Status Assessment
Phenotypic records 12 traits, 497–520 records each
Pedigree (sire–dam) 3-generation depth (GSIRENO, SIRENO, DAMNO)
Multi-generation data ≥ 2 generations; 4 sires & 20 dams phenotyped
Parent–offspring links Enables genetic trend & parent–offspring regression
Family structure 124 full-sib families; 5656 paternal HS pairs
Fixed effects SEX, GCODE, CAGENO available
Genotype data SNP genotypes available — genomic selection feasible
Population size ⚠️ 520 birds — adequate for sire model; expand for higher accuracy
Male BW records 20 males lack body weight data

8.1 Feasible Analyses

Analysis Feasibility Software
Sire model BLUP ✅ Recommended BLUPF90, ASReml
Animal model BLUP ✅ Feasible BLUPF90, ASReml
Variance components (REML) ✅ Feasible AIREMLF90, ASReml
Parent–offspring regression ✅ Feasible R
Genetic trend analysis ✅ Feasible R, BLUPF90
Multi-trait animal model ⚠️ Feasible (2–3 traits) AIREMLF90
Repeatability model (BW) ✅ Feasible BLUPF90
Selection index optimization ✅ Feasible R (Smith-Hazel)
Single-step GBLUP (ssGBLUP) ✅ Feasible BLUPF90+, PREGSF90
GBLUP / Bayesian methods ✅ Feasible JWAS, BayesR, rrBLUP
GWAS ✅ Feasible POSTGSF90, GEMMA

9 Expected Genetic Parameters

Based on literature for similar tropical poultry populations. These should be estimated from this data using REML.

Trait Expected h² Literature Range Comment
Body weight (BW1–BW3) 0.25–0.45 0.20–0.55 Moderate-to-high; responds well to selection
Egg weight (EW1–EW3) 0.40–0.60 0.35–0.65 Among the most heritable poultry traits
Age at first egg (AFEGG) 0.25–0.40 0.20–0.45 Moderate heritability
Egg production (EP, TE) 0.10–0.25 0.05–0.30 Low-to-moderate; large environmental influence
Persistency (LP/PP) 0.05–0.15 0.03–0.20 Low; difficult to improve by selection alone

10 Genomic Selection

Genotype data (SNP markers) is available for this population, making genomic selection feasible. Combined with the multi-generation pedigree and phenotypic records, the following genomic analyses can be implemented:

Analysis Method Software
Single-step GBLUP Combines A and G matrices via H-matrix BLUPF90+ (PREGSF90)
GBLUP G-matrix replaces A for genotyped animals BLUPF90, rrBLUP, JWAS
Bayesian methods BayesA, BayesB, BayesCπ, BayesR JWAS (Julia), GenSel
GWAS Single-SNP or window-based association POSTGSF90, GEMMA
Cross-validation Gen1 training, Gen2 validation Custom R/Julia scripts

Implementation steps:

  1. Quality control of genotype data (call rate, MAF, HWE, parent–offspring conflicts) → PLINK, PREGSF90
  2. Build G-matrix and blend with pedigree A-matrix → PREGSF90
  3. Estimate variance components with genomic information → AIREMLF90 (ssGBLUP)
  4. Compute GEBVs for all traits → BLUPF90+
  5. Cross-validate prediction accuracy → Custom scripts
  6. Run GWAS to identify significant QTL regions → POSTGSF90
  7. Construct optimized genomic selection index → R

11 Predicted Genetic Gain Scenarios

Genetic gain per generation: ΔG = i × rTI × σA

# Use actual data for sigma_P
sigma_TE    <- desc_stats %>% filter(Trait == "TE") %>% pull(SD)
mean_TE     <- desc_stats %>% filter(Trait == "TE") %>% pull(Mean)
sigma_AFEGG <- desc_stats %>% filter(Trait == "AFEGG") %>% pull(SD)
mean_AFEGG  <- desc_stats %>% filter(Trait == "AFEGG") %>% pull(Mean)

# Scenario calculations
scenarios <- data.frame(
  Scenario = c(
    "1: Pedigree BLUP — TE (top 10% females)",
    "2: Pedigree BLUP — AFEGG (top 20%)",
    "3: Multi-trait index (TE + EW + AFEGG)",
    "4: Genomic selection — TE (ssGBLUP)"
  ),
  h2 = c(0.15, 0.30, NA, 0.15),
  i = c(1.76, 1.40, 1.40, 1.76),
  r_TI = c(0.45, 0.55, 0.50, 0.60),
  sigma_P = c(sigma_TE, sigma_AFEGG, sigma_TE, sigma_TE),
  stringsAsFactors = FALSE
) %>%
  mutate(
    sigma_A = ifelse(!is.na(h2), round(sqrt(h2) * sigma_P, 2), NA),
    Delta_G = ifelse(!is.na(sigma_A), round(i * r_TI * sigma_A, 1), NA)
  )

# Scenario 3: approximate multi-trait gain (reduced from single-trait by ~30%)
scenarios$Delta_G[3] <- round(scenarios$Delta_G[1] * 0.70, 1)
scenarios$sigma_A[3] <- NA

gain_display <- scenarios %>%
  select(Scenario, Delta_G) %>%
  mutate(
    Unit = c("eggs/gen", "days/gen", "eggs/gen (TE component)", "eggs/gen"),
    Delta_G = c(
      paste0("+", Delta_G[1]),
      paste0(Delta_G[2]),
      paste0("+", Delta_G[3]),
      paste0("+", Delta_G[4])
    )
  )

kable(gain_display, caption = "Predicted Genetic Gain per Generation",
      col.names = c("Scenario", "ΔG", "Unit")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Predicted Genetic Gain per Generation
Scenario ΔG Unit
1: Pedigree BLUP — TE (top 10% females) +20.3 eggs/gen
2: Pedigree BLUP — AFEGG (top 20%) 4.3 days/gen
3: Multi-trait index (TE + EW + AFEGG) +14.2 eggs/gen (TE component)
4: Genomic selection — TE (ssGBLUP) +27.1 eggs/gen
if (nrow(gen_summary) == 2) {
  observed_TE_change <- g2$Mean_TE - g1$Mean_TE
  estimated_genetic  <- round(observed_TE_change / 2, 1)
  predicted_blup     <- scenarios$Delta_G[1]
  multiplier         <- round(as.numeric(predicted_blup) / estimated_genetic, 0)
  
  cat(paste0('\n\n<div class="success-box">\n',
             '**📈 Observed vs. Predicted:** The observed phenotypic trend is +',
             round(observed_TE_change, 1), ' eggs in TE (Gen1 → Gen2). ',
             'Assuming ~50% is genetic, the realized gain is ~', estimated_genetic,
             ' eggs/generation. Formal BLUP selection (predicted: +', predicted_blup,
             ' eggs) could multiply progress by approximately **', multiplier, 'x**.\n',
             '</div>\n\n'))
}

📈 Observed vs. Predicted: The observed phenotypic trend is +7.6 eggs in TE (Gen1 → Gen2). Assuming ~50% is genetic, the realized gain is ~3.8 eggs/generation. Formal BLUP selection (predicted: +20.3 eggs) could multiply progress by approximately 5x.

11.1 Realistic Timeline

Timeframe Generations Expected Cumulative Gain (TE) Approach
Year 1–2 1 +12–18 eggs Pedigree BLUP / ssGBLUP
Year 3–5 2–3 +25–45 eggs ssGBLUP with expanded recording
Year 5–8 3–5 +50–80 eggs Mature genomic selection program

12 Recommendations & Next Steps

12.1 Immediate (0–6 months)

12.1.1 Priority Actions

  1. Run genetic evaluation (pedigree + genomic): Fit single-step GBLUP using BLUPF90 suite — PREGSF90 for genotype QC and G-matrix, AIREMLF90 for variance components, BLUPF90+ for GEBVs. Start with single-trait models for TE, AFEGG, EW, and BW.

  2. Estimate genetic parameters: Use REML (AIREMLF90) to estimate heritabilities and genetic correlations. Validate with parent–offspring regression using the 4 sire and 20 dam links.

  3. Estimate genetic trends: Compare mean EBVs/GEBVs between Gen1 and Gen2 to quantify realized genetic progress.

  4. Record male body weights: Begin recording BW for all 20 males to improve sire EBV accuracy.

  5. Investigate zero-production records: Check whether zero values in EP, LP, TE represent true non-producers, early culling, or data entry errors.

12.2 Medium-Term (6–18 months)

  1. Develop a multi-trait selection index: Construct a balanced index with economic weights for TE, EW, BW, and AFEGG using Smith-Hazel methodology.

  2. Expand recording: Add feed intake (FCR), egg quality traits, and livability data.

  3. Increase population size: Target ≥ 1,000 birds per generation for robust parameter estimation and genomic prediction accuracy.

12.3 Long-Term (1–3 years)

  1. Scale genomic selection: Genotype all selection candidates at hatch to maximize benefit of reduced generation interval.

  2. Genetic dissemination: Develop a strategy for how improved genetics from AMO farm flow to multiplier and commercial flocks.