library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
library(knitr)
library(kableExtra)

# ── Custom ggplot2 theme matching the report palette (colorblind-friendly) ──
theme_report <- theme_minimal(base_size = 13) +
  theme(
    plot.title       = element_text(face = "bold", size = 15, color = "#2166AC"),
    plot.subtitle    = element_text(color = "#6c757d", size = 11),
    plot.background  = element_rect(fill = "white", color = NA),
    panel.background = element_rect(fill = "white", color = NA),
    panel.grid.major = element_line(color = "#f0f0f0", linewidth = 0.4),
    panel.grid.minor = element_blank(),
    axis.title       = element_text(size = 11, color = "#495057", face = "bold"),
    axis.text        = element_text(size = 10, color = "#6c757d"),
    legend.position  = "top",
    legend.title     = element_text(face = "bold", size = 10),
    legend.text      = element_text(size = 10),
    strip.text       = element_text(face = "bold", size = 11, color = "#2166AC"),
    strip.background = element_rect(fill = "#D1E5F0", color = NA)
  )
theme_set(theme_report)

# ── Colorblind-friendly palettes (RdBu / Blue-Orange diverging) ──
pal_primary <- c("#2166AC", "#4393C3", "#67A9CF", "#92C5DE", "#D1E5F0")
pal_accent  <- c("#D6604D", "#F4A582", "#FDDBC7", "#4393C3", "#2166AC")
pal_sex     <- c("F" = "#4393C3", "M" = "#D6604D")

working_dir <- "C:/Users/SAlemu/OneDrive - CGIAR/Documents/Nigeria_poultry"
setwd(working_dir)
file_path <- "AMO NIGERIA DATA.xlsx"
dat <- read_excel(file_path, sheet = 1)

dat <- dat %>%
  select(-any_of(c("INDEX", "REMARK", "FILTER_NUM", "SPOT_NUMB", 
                    "TAGNO", "SAMPLE_UNI"))) %>%
  mutate(
    SIRENO  = as.character(SIRENO),
    DAMNO   = as.character(DAMNO),
    GSIRENO = as.character(GSIRENO),
    WINGNO  = as.character(WINGNO),
    SEX     = as.factor(SEX),
    GCODE   = as.factor(GCODE)
  )

1 Executive Summary

This report provides a data-driven assessment of the AMO Nigeria poultry farm dataset for genetic evaluation and breeding strategy development.

n_animals <- nrow(dat)
n_females <- sum(dat$SEX == "F", na.rm = TRUE)
n_males   <- sum(dat$SEX == "M", na.rm = TRUE)
n_sires   <- length(unique(na.omit(dat$SIRENO)))
n_dams    <- length(unique(na.omit(dat$DAMNO)))
n_gsires  <- length(unique(na.omit(dat$GSIRENO)))

sires_in_data <- sum(unique(na.omit(dat$SIRENO)) %in% dat$WINGNO)
dams_in_data  <- sum(unique(na.omit(dat$DAMNO)) %in% dat$WINGNO)

# Generation assignment
gen1_sires <- dat$WINGNO[dat$WINGNO %in% dat$SIRENO]
gen1_dams  <- dat$WINGNO[dat$WINGNO %in% dat$DAMNO]
dat$generation <- ifelse(dat$WINGNO %in% c(gen1_sires, gen1_dams), "Gen1_Parent", "Gen2_Offspring")
n_gen1 <- sum(dat$generation == "Gen1_Parent")
n_gen2 <- sum(dat$generation == "Gen2_Offspring")

Parameter	Value
Total birds	520
Females / Males	500 / 20
Unique sires	31
Unique dams	173
Unique grand-sires	14
Sires with own phenotype	4
Dams with own phenotype	20
Generation 1 (parents)	24
Generation 2 (offspring)	496
Generations spanned	≥ 2 (overlapping)

2 Data Overview & Structure

The dataset contains 520 records with 20 columns spanning identification, pedigree, and phenotypic variables.

2.1 Missing Values

missing_summary <- dat %>%
  summarise(across(everything(), ~sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "Variable", values_to = "N_Missing") %>%
  mutate(Pct_Missing = round(N_Missing / nrow(dat) * 100, 2)) %>%
  filter(N_Missing > 0) %>%
  arrange(desc(N_Missing))

if (nrow(missing_summary) > 0) {
  kable(missing_summary, caption = "Variables with Missing Values") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
} else {
  cat("No missing values detected.")
}

Variables with Missing Values
Variable	N_Missing	Pct_Missing
CAGENO	23	4.42
EW1	23	4.42
EW2	23	4.42
EW3	23	4.42
EWT	23	4.42
BW1	23	4.42
BW2	23	4.42
BW3	23	4.42
EP	23	4.42
LP	23	4.42
AFEGG	23	4.42
PP	23	4.42
GSIRENO	3	0.58
SIRENO	3	0.58
DAMNO	3	0.58

2.2 Sex Distribution

sex_tab <- dat %>%
  count(SEX, name = "Count") %>%
  mutate(Percent = round(Count / sum(Count) * 100, 2))

kable(sex_tab, caption = "Sex Distribution") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Sex Distribution
SEX	Count	Percent
F	500	96.15
M	20	3.85

2.3 Genetic Code Distribution

gcode_tab <- dat %>%
  count(GCODE, name = "Count") %>%
  mutate(Percent = round(Count / sum(Count) * 100, 2)) %>%
  arrange(desc(Count))

kable(gcode_tab, caption = "Genetic Code (GCODE) Distribution") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Genetic Code (GCODE) Distribution
GCODE	Count	Percent
K10	520	100

3 Descriptive Statistics — Production & Growth Traits

trait_cols <- c("EW1", "EW2", "EW3", "EWT", "BW1", "BW2", "BW3",
                "EP", "LP", "TE", "AFEGG", "PP")
trait_cols <- trait_cols[trait_cols %in% names(dat)]

desc_stats <- dat %>%
  select(all_of(trait_cols)) %>%
  pivot_longer(everything(), names_to = "Trait", values_to = "Value") %>%
  filter(!is.na(Value)) %>%
  group_by(Trait) %>%
  summarise(
    N      = n(),
    Mean   = round(mean(Value), 2),
    SD     = round(sd(Value), 2),
    `CV%`  = round(sd(Value) / mean(Value) * 100, 2),
    Min    = round(min(Value), 2),
    Q1     = round(quantile(Value, 0.25), 2),
    Median = round(median(Value), 2),
    Q3     = round(quantile(Value, 0.75), 2),
    Max    = round(max(Value), 2),
    .groups = "drop"
  ) %>%
  arrange(Trait)

kable(desc_stats, caption = "Descriptive Statistics for All Traits") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = TRUE) %>%
  row_spec(0, bold = TRUE)

Descriptive Statistics for All Traits
Trait	N	Mean	SD	CV%	Min	Q1	Median	Q3	Max
AFEGG	497	149.27	10.09	6.76	125.0	143.00	149.0	156.00	200.00
BW1	497	1756.04	165.43	9.42	1193.0	1651.00	1763.0	1863.00	2168.00
BW2	497	2462.32	260.55	10.58	1108.0	2293.00	2458.0	2638.00	3398.00
BW3	497	2699.64	294.86	10.92	1385.0	2533.00	2703.0	2868.00	3508.00
EP	497	107.16	28.20	26.32	-3.0	95.00	115.0	127.00	153.00
EW1	497	43.51	2.87	6.59	32.2	41.80	43.6	45.40	56.90
EW2	497	55.00	4.29	7.80	35.0	52.50	55.0	57.80	69.00
EW3	497	60.17	6.48	10.76	32.0	57.00	60.0	63.10	108.00
EWT	497	52.80	3.40	6.44	39.9	50.80	52.9	54.90	68.80
LP	497	53.06	36.72	69.20	-3.0	14.00	66.0	86.00	108.00
PP	497	61.25	22.13	36.14	0.7	44.60	69.2	78.91	93.14
TE	520	153.14	66.19	43.22	-5.0	105.75	178.5	205.00	252.00

3.1 Egg Production Trait Distributions

egg_cols <- c("EP", "LP", "TE", "AFEGG")
egg_cols <- egg_cols[egg_cols %in% names(dat)]

# Full trait name labels
egg_labels <- c(
  "EP"    = "Early Egg Production",
  "LP"    = "Late Egg Production",
  "TE"    = "Total Egg Number",
  "AFEGG" = "Age at First Egg (days)"
)

dat %>%
  select(all_of(egg_cols)) %>%
  pivot_longer(everything(), names_to = "Trait", values_to = "Value") %>%
  filter(!is.na(Value)) %>%
  mutate(Trait = factor(Trait, levels = names(egg_labels), labels = egg_labels)) %>%
  ggplot(aes(x = Value)) +
  geom_histogram(aes(y = after_stat(density)), fill = "#4393C3", color = "white",
                 alpha = 0.85, bins = 30) +
  geom_density(color = "#D6604D", linewidth = 1, alpha = 0.7) +
  facet_wrap(~Trait, scales = "free") +
  labs(title = "Distribution of Egg Production Traits",
       subtitle = "AMO Nigeria Poultry Data — Histogram with density overlay",
       x = "Value", y = "Density")

Distribution of egg production traits.

Key observations:

Egg production traits exhibit high CV (26.32% for EP, 43.22% for TE) — excellent variation for selection.
Age at first egg averages 149.27 days (~21.3 weeks).
Body weights show moderate CV (9–11%), with progressive growth from BW1 to BW3.

4 Pedigree & Family Structure

4.1 Offspring per Sire

offspring_per_sire <- dat %>%
  filter(!is.na(SIRENO)) %>%
  count(SIRENO, name = "N_Offspring") %>%
  arrange(desc(N_Offspring))

sire_summary <- offspring_per_sire %>%
  summarise(
    N_Sires        = n(),
    Mean_Offspring  = round(mean(N_Offspring), 2),
    SD_Offspring    = round(sd(N_Offspring), 2),
    Min             = min(N_Offspring),
    Median          = median(N_Offspring),
    Max             = max(N_Offspring),
    Total_Offspring = sum(N_Offspring)
  )

kable(sire_summary, caption = "Summary: Offspring per Sire") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Summary: Offspring per Sire
N_Sires	Mean_Offspring	SD_Offspring	Min	Median	Max	Total_Offspring
31	16.68	12.63	1	17	40	517

ggplot(offspring_per_sire, aes(x = N_Offspring)) +
  geom_histogram(binwidth = 1, fill = "#4393C3", color = "white", alpha = 0.85) +
  geom_vline(aes(xintercept = mean(N_Offspring)), color = "#D6604D",
             linewidth = 1.2, linetype = "dashed") +
  annotate("text", x = mean(offspring_per_sire$N_Offspring) + 0.5,
           y = Inf, vjust = 2, hjust = -0.1,
           label = paste("Mean =", round(mean(offspring_per_sire$N_Offspring), 1)),
           color = "#D6604D", fontface = "bold", size = 4) +
  labs(title = "Distribution of Offspring per Sire",
       subtitle = paste("N sires =", nrow(offspring_per_sire)),
       x = "Number of Offspring", y = "Number of Sires")

Distribution of offspring per sire.

kable(head(offspring_per_sire, 10), caption = "Top 10 Sires by Offspring Count") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Top 10 Sires by Offspring Count
SIRENO	N_Offspring
3659	40
2538	37
4043	35
4776	35
1661	31
1211	27
1662	27
1197	25
3620	25
2853	24

4.2 Offspring per Dam

offspring_per_dam <- dat %>%
  filter(!is.na(DAMNO)) %>%
  count(DAMNO, name = "N_Offspring") %>%
  arrange(desc(N_Offspring))

dam_summary <- offspring_per_dam %>%
  summarise(
    N_Dams         = n(),
    Mean_Offspring  = round(mean(N_Offspring), 2),
    SD_Offspring    = round(sd(N_Offspring), 2),
    Min             = min(N_Offspring),
    Median          = median(N_Offspring),
    Max             = max(N_Offspring),
    Total_Offspring = sum(N_Offspring)
  )

kable(dam_summary, caption = "Summary: Offspring per Dam") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Summary: Offspring per Dam
N_Dams	Mean_Offspring	SD_Offspring	Min	Median	Max	Total_Offspring
173	2.99	1.79	1	3	8	517

ggplot(offspring_per_dam, aes(x = N_Offspring)) +
  geom_histogram(binwidth = 1, fill = "#D6604D", color = "white", alpha = 0.85) +
  geom_vline(aes(xintercept = mean(N_Offspring)), color = "#2166AC",
             linewidth = 1.2, linetype = "dashed") +
  annotate("text", x = mean(offspring_per_dam$N_Offspring) + 0.3,
           y = Inf, vjust = 2, hjust = -0.1,
           label = paste("Mean =", round(mean(offspring_per_dam$N_Offspring), 1)),
           color = "#2166AC", fontface = "bold", size = 4) +
  labs(title = "Distribution of Offspring per Dam",
       subtitle = paste("N dams =", nrow(offspring_per_dam)),
       x = "Number of Offspring", y = "Number of Dams")

Distribution of offspring per dam.

4.3 Sire × Dam Mating Combinations

sire_dam_combo <- dat %>%
  filter(!is.na(SIRENO) & !is.na(DAMNO)) %>%
  count(SIRENO, DAMNO, name = "N_Offspring") %>%
  arrange(desc(N_Offspring))

sire_dam_summary <- sire_dam_combo %>%
  summarise(
    N_Matings       = n(),
    Mean_Offspring  = round(mean(N_Offspring), 2),
    SD_Offspring    = round(sd(N_Offspring), 2),
    Min             = min(N_Offspring),
    Median          = median(N_Offspring),
    Max             = max(N_Offspring),
    Total_Offspring = sum(N_Offspring)
  )

kable(sire_dam_summary, caption = "Summary: Offspring per Sire-Dam Mating") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Summary: Offspring per Sire-Dam Mating
N_Matings	Mean_Offspring	SD_Offspring	Min	Median	Max	Total_Offspring
173	2.99	1.79	1	3	8	517

4.4 Full-Sib & Half-Sib Relationships

# Full-sibs
full_sib_families <- sire_dam_combo %>% filter(N_Offspring >= 2)
n_full_sib_families <- nrow(full_sib_families)
n_full_sib_pairs    <- sum(choose(full_sib_families$N_Offspring, 2))

# Full-sib family size distribution
fs_size_dist <- full_sib_families %>%
  count(N_Offspring, name = "N_Families") %>%
  arrange(N_Offspring)

# Paternal half-sibs
paternal_hs <- dat %>%
  filter(!is.na(SIRENO) & !is.na(DAMNO)) %>%
  group_by(SIRENO) %>%
  summarise(
    N_Total     = n(),
    N_Dams      = n_distinct(DAMNO),
    Total_Pairs = choose(N_Total, 2),
    .groups = "drop"
  )

fs_per_sire <- dat %>%
  filter(!is.na(SIRENO) & !is.na(DAMNO)) %>%
  count(SIRENO, DAMNO, name = "n") %>%
  group_by(SIRENO) %>%
  summarise(FS_Pairs = sum(choose(n, 2)), .groups = "drop")

paternal_hs <- paternal_hs %>%
  left_join(fs_per_sire, by = "SIRENO") %>%
  mutate(HS_Pairs = Total_Pairs - FS_Pairs)

n_paternal_hs_pairs <- sum(paternal_hs$HS_Pairs)
n_sires_multi_dam   <- sum(paternal_hs$N_Dams > 1)

# Maternal half-sibs
maternal_hs <- dat %>%
  filter(!is.na(SIRENO) & !is.na(DAMNO)) %>%
  group_by(DAMNO) %>%
  summarise(
    N_Total     = n(),
    N_Sires     = n_distinct(SIRENO),
    Total_Pairs = choose(N_Total, 2),
    .groups = "drop"
  )

fs_per_dam <- dat %>%
  filter(!is.na(SIRENO) & !is.na(DAMNO)) %>%
  count(SIRENO, DAMNO, name = "n") %>%
  group_by(DAMNO) %>%
  summarise(FS_Pairs = sum(choose(n, 2)), .groups = "drop")

maternal_hs <- maternal_hs %>%
  left_join(fs_per_dam, by = "DAMNO") %>%
  mutate(HS_Pairs = Total_Pairs - FS_Pairs)

n_maternal_hs_pairs <- sum(maternal_hs$HS_Pairs)
n_dams_multi_sire   <- sum(maternal_hs$N_Sires > 1)

4.4.1 Relationship Summary

rel_summary <- data.frame(
  Relationship = c("Full-sib families (sire-dam combos ≥ 2 offspring)",
                    "Full-sib pairs",
                    "Paternal half-sib pairs (same sire, different dam)",
                    "Maternal half-sib pairs (same dam, different sire)",
                    "Total half-sib pairs",
                    "Sires mated to multiple dams",
                    "Dams mated to multiple sires"),
  Count = c(n_full_sib_families,
            n_full_sib_pairs,
            n_paternal_hs_pairs,
            n_maternal_hs_pairs,
            n_paternal_hs_pairs + n_maternal_hs_pairs,
            n_sires_multi_dam,
            n_dams_multi_sire)
)

kable(rel_summary, caption = "Pedigree Relationship Summary",
      col.names = c("Relationship", "N")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Pedigree Relationship Summary
Relationship	N
Full-sib families (sire-dam combos ≥ 2 offspring)	124
Full-sib pairs	790
Paternal half-sib pairs (same sire, different dam)	5656
Maternal half-sib pairs (same dam, different sire)	0
Total half-sib pairs	5656
Sires mated to multiple dams	23
Dams mated to multiple sires	0

4.4.2 Full-Sib Family Size Distribution

if (nrow(fs_size_dist) > 0) {
  ggplot(fs_size_dist, aes(x = factor(N_Offspring), y = N_Families)) +
    geom_col(fill = "#4393C3", alpha = 0.85, width = 0.7) +
    geom_text(aes(label = N_Families), vjust = -0.5, color = "#2166AC",
              fontface = "bold", size = 3.5) +
    labs(title = "Full-Sib Family Size Distribution",
         subtitle = "Number of families by offspring count per sire-dam combination",
         x = "Family Size (offspring per sire-dam combo)",
         y = "Number of Families")
}

Full-sib family size distribution.

The hierarchical mating design (sires mated to multiple dams) generates both full-sib and paternal half-sib families — essential for partitioning additive genetic variance (between-sire component) and common environmental/dominance effects (within-sire, between-dam component).

5 Multi-Generation Structure & Phenotypic Trends

Cross-referencing WINGNO against SIRENO and DAMNO reveals that 4 sires and 20 dams are themselves phenotyped in this dataset, confirming at least two overlapping generations.

gen_summary <- dat %>%
  group_by(generation) %>%
  summarise(
    N         = n(),
    Mean_TE   = round(mean(TE, na.rm = TRUE), 1),
    Mean_EP   = round(mean(EP, na.rm = TRUE), 1),
    Mean_AFEGG = round(mean(AFEGG, na.rm = TRUE), 1),
    Mean_BW1  = round(mean(BW1, na.rm = TRUE), 1),
    Mean_EWT  = round(mean(EWT, na.rm = TRUE), 1),
    .groups = "drop"
  )

kable(gen_summary, caption = "Phenotypic Means by Generation",
      col.names = c("Generation", "N", "TE", "EP", "AFEGG", "BW1", "EWT")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Phenotypic Means by Generation
Generation	N	TE	EP	AFEGG	BW1	EWT
Gen1_Parent	24	145.9	101.9	150.2	1747.3	53.5
Gen2_Offspring	496	153.5	107.4	149.2	1756.5	52.8

if (nrow(gen_summary) == 2) {
  g1 <- gen_summary %>% filter(generation == "Gen1_Parent")
  g2 <- gen_summary %>% filter(generation == "Gen2_Offspring")
  
  trend <- data.frame(
    Trait = c("TE (total eggs)", "EP (early production)", "AFEGG (days)", "BW1 (g)", "EWT (g)"),
    Gen1  = c(g1$Mean_TE, g1$Mean_EP, g1$Mean_AFEGG, g1$Mean_BW1, g1$Mean_EWT),
    Gen2  = c(g2$Mean_TE, g2$Mean_EP, g2$Mean_AFEGG, g2$Mean_BW1, g2$Mean_EWT)
  ) %>%
    mutate(
      Change = round(Gen2 - Gen1, 1),
      `Change%` = round((Gen2 - Gen1) / Gen1 * 100, 1),
      Direction = case_when(
        Trait == "AFEGG (days)" & Change < 0 ~ "✅ Favorable ↓",
        Trait == "AFEGG (days)" & Change >= 0 ~ "⚠️ Unfavorable ↑",
        Change > 1 ~ "✅ Favorable ↑",
        Change < -1 ~ "⚠️ Check ↓",
        TRUE ~ "➡️ Stable →"
      )
    )
  
  kable(trend, caption = "Phenotypic Trends: Gen1 (Parents) → Gen2 (Offspring)") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
}

Phenotypic Trends: Gen1 (Parents) → Gen2 (Offspring)
Trait	Gen1	Gen2	Change	Change%	Direction
TE (total eggs)	145.9	153.5	7.6	5.2	✅ Favorable ↑ \|
EP (early production)	101.9	107.4	5.5	5.4	✅ Favorable ↑ \|
AFEGG (days)	150.2	149.2	-1.0	-0.7	✅ Favorable ↓ \|
BW1 (g)	1747.3	1756.5	9.2	0.5	✅ Favorable ↑ \|
EWT (g)	53.5	52.8	-0.7	-1.3	➡️ Stable →

The multi-generation structure enables parent–offspring regression for heritability estimation, genetic trend analysis, and cross-validation of prediction accuracy.

6 Body Weight Analysis

bw_cols <- c("BW1", "BW2", "BW3")
bw_cols <- bw_cols[bw_cols %in% names(dat)]

bw_by_sex <- dat %>%
  filter(!is.na(SEX)) %>%
  select(SEX, all_of(bw_cols)) %>%
  pivot_longer(cols = all_of(bw_cols), names_to = "Period", values_to = "BodyWeight") %>%
  filter(!is.na(BodyWeight)) %>%
  group_by(SEX, Period) %>%
  summarise(
    N      = n(),
    Mean   = round(mean(BodyWeight), 1),
    SD     = round(sd(BodyWeight), 1),
    `CV%`  = round(sd(BodyWeight) / mean(BodyWeight) * 100, 2),
    Min    = round(min(BodyWeight), 0),
    Median = round(median(BodyWeight), 0),
    Max    = round(max(BodyWeight), 0),
    .groups = "drop"
  ) %>%
  arrange(Period, SEX)

kable(bw_by_sex, caption = "Body Weight Summary by Sex and Period") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = TRUE)

Body Weight Summary by Sex and Period
SEX	Period	N	Mean	SD	CV%	Min	Median	Max
F	BW1	497	1756.0	165.4	9.42	1193	1763	2168
F	BW2	497	2462.3	260.5	10.58	1108	2458	3398
F	BW3	497	2699.6	294.9	10.92	1385	2703	3508

dat %>%
  filter(!is.na(SEX)) %>%
  select(SEX, all_of(bw_cols)) %>%
  pivot_longer(cols = all_of(bw_cols), names_to = "Period", values_to = "BodyWeight") %>%
  filter(!is.na(BodyWeight)) %>%
  ggplot(aes(x = Period, y = BodyWeight, fill = SEX)) +
  geom_boxplot(alpha = 0.8, outlier.shape = 21, outlier.size = 2, width = 0.6) +
  stat_summary(fun = mean, geom = "point", shape = 18, size = 3,
               color = "#D6604D", position = position_dodge(width = 0.6)) +
  labs(title = "Body Weight by Sex and Period",
       subtitle = "AMO Nigeria Poultry Data — diamonds show group means",
       x = "Body Weight Period", y = "Body Weight (g)", fill = "Sex") +
  scale_fill_manual(values = pal_sex)

Body weight distribution across measurement periods by sex.

⚠️ Note: Body weight records are available only for females (n=497). The 20 males lack BW data. Sire EBVs for BW can still be estimated from daughter performance.

7 Phenotypic Correlations

cor_data <- dat %>% select(all_of(trait_cols)) %>% drop_na()
cor_mat  <- round(cor(cor_data), 3)

kable(cor_mat, caption = paste0("Phenotypic Correlations (n = ", nrow(cor_data), " complete cases)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE, font_size = 11)

Phenotypic Correlations (n = 497 complete cases)
	EW1	EW2	EW3	EWT	BW1	BW2	BW3	EP	LP	TE	AFEGG	PP
EW1	1.000	0.310	0.116	0.487	-0.076	0.092	0.013	-0.103	0.053	-0.017	0.271	0.010
EW2	0.310	1.000	0.426	0.770	0.001	0.081	0.014	-0.064	0.029	-0.013	0.192	0.010
EW3	0.116	0.426	1.000	0.846	0.049	0.051	0.011	-0.029	0.046	0.014	0.067	0.023
EWT	0.487	0.770	0.846	1.000	0.012	0.090	0.019	-0.076	0.055	-0.002	0.196	0.018
BW1	-0.076	0.001	0.049	0.012	1.000	0.491	0.332	0.128	0.040	0.086	-0.400	0.042
BW2	0.092	0.081	0.051	0.090	0.491	1.000	0.609	-0.069	0.030	-0.014	0.008	-0.016
BW3	0.013	0.014	0.011	0.019	0.332	0.609	1.000	-0.094	-0.122	-0.122	-0.033	-0.128
EP	-0.103	-0.064	-0.029	-0.076	0.128	-0.069	-0.094	1.000	0.629	0.874	-0.353	0.849
LP	0.053	0.029	0.046	0.055	0.040	0.030	-0.122	0.629	1.000	0.928	0.000	0.936
TE	-0.017	-0.013	0.014	-0.002	0.086	-0.014	-0.122	0.874	0.928	1.000	-0.169	0.993
AFEGG	0.271	0.192	0.067	0.196	-0.400	0.008	-0.033	-0.353	0.000	-0.169	1.000	-0.070
PP	0.010	0.010	0.023	0.018	0.042	-0.016	-0.128	0.849	0.936	0.993	-0.070	1.000

# Heatmap visualization
cor_long <- as.data.frame(as.table(cor_mat)) %>%
  rename(Trait1 = Var1, Trait2 = Var2, r = Freq)

ggplot(cor_long, aes(x = Trait1, y = Trait2, fill = r)) +
  geom_tile(color = "white", linewidth = 0.5) +
  geom_text(aes(label = sprintf("%.2f", r)),
            size = 3, color = ifelse(abs(cor_long$r) > 0.5, "white", "#333333")) +
  scale_fill_gradient2(low = "#D6604D", mid = "white", high = "#2166AC",
                       midpoint = 0, limits = c(-1, 1),
                       name = "Correlation") +
  labs(title = "Phenotypic Correlation Matrix",
       subtitle = "AMO Nigeria Poultry — All measured traits") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title = element_blank(),
        panel.grid = element_blank())

Phenotypic correlation heatmap for all traits.

Critical correlations for breeding:

key_corr <- data.frame(
  `Trait Pair` = c("EP ↔ TE", "LP ↔ TE", "BW1 ↔ AFEGG",
                    "BW2 ↔ BW3", "BW1 ↔ BW2", "EW1 ↔ EP", "BW3 ↔ TE"),
  r = c(cor_mat["EP","TE"], cor_mat["LP","TE"],
        cor_mat["BW1","AFEGG"], cor_mat["BW2","BW3"], cor_mat["BW1","BW2"],
        cor_mat["EW1","EP"], cor_mat["BW3","TE"]),
  Implication = c(
    "Early production strongly predicts total production — enables early selection",
    "Late production dominates total — persistency matters",
    "Heavier birds mature earlier — biologically meaningful",
    "Strong repeatability across body weight periods",
    "Moderate repeatability — early BW predicts later BW",
    "Mild antagonism between egg size and egg number",
    "Weak negative — classic layer BW vs production trade-off"
  ),
  check.names = FALSE
)

kable(key_corr, caption = "Key Correlations for Breeding Strategy") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = TRUE)

Key Correlations for Breeding Strategy
Trait Pair	r	Implication
EP ↔︎ TE	0.874	Early production strongly predicts total production — enables early selection
LP ↔︎ TE	0.928	Late production dominates total — persistency matters
BW1 ↔︎ AFEGG	-0.400	Heavier birds mature earlier — biologically meaningful
BW2 ↔︎ BW3	0.609	Strong repeatability across body weight periods
BW1 ↔︎ BW2	0.491	Moderate repeatability — early BW predicts later BW
EW1 ↔︎ EP	-0.103	Mild antagonism between egg size and egg number
BW3 ↔︎ TE	-0.122	Weak negative — classic layer BW vs production trade-off

8 Readiness for Genetic Evaluation

Requirement	Status	Assessment
Phenotypic records	✅	12 traits, 497–520 records each
Pedigree (sire–dam)	✅	3-generation depth (GSIRENO, SIRENO, DAMNO)
Multi-generation data	✅	≥ 2 generations; 4 sires & 20 dams phenotyped
Parent–offspring links	✅	Enables genetic trend & parent–offspring regression
Family structure	✅	124 full-sib families; 5656 paternal HS pairs
Fixed effects	✅	SEX, GCODE, CAGENO available
Genotype data	✅	SNP genotypes available — genomic selection feasible
Population size	⚠️	520 birds — adequate for sire model; expand for higher accuracy
Male BW records	❌	20 males lack body weight data

8.1 Feasible Analyses

Analysis	Feasibility	Software
Sire model BLUP	✅ Recommended	BLUPF90, ASReml
Animal model BLUP	✅ Feasible	BLUPF90, ASReml
Variance components (REML)	✅ Feasible	AIREMLF90, ASReml
Parent–offspring regression	✅ Feasible	R
Genetic trend analysis	✅ Feasible	R, BLUPF90
Multi-trait animal model	⚠️ Feasible (2–3 traits)	AIREMLF90
Repeatability model (BW)	✅ Feasible	BLUPF90
Selection index optimization	✅ Feasible	R (Smith-Hazel)
Single-step GBLUP (ssGBLUP)	✅ Feasible	BLUPF90+, PREGSF90
GBLUP / Bayesian methods	✅ Feasible	JWAS, BayesR, rrBLUP
GWAS	✅ Feasible	POSTGSF90, GEMMA

9 Expected Genetic Parameters

Based on literature for similar tropical poultry populations. These should be estimated from this data using REML.

Trait	Expected h²	Literature Range	Comment
Body weight (BW1–BW3)	0.25–0.45	0.20–0.55	Moderate-to-high; responds well to selection
Egg weight (EW1–EW3)	0.40–0.60	0.35–0.65	Among the most heritable poultry traits
Age at first egg (AFEGG)	0.25–0.40	0.20–0.45	Moderate heritability
Egg production (EP, TE)	0.10–0.25	0.05–0.30	Low-to-moderate; large environmental influence
Persistency (LP/PP)	0.05–0.15	0.03–0.20	Low; difficult to improve by selection alone

10 Genomic Selection

Genotype data (SNP markers) is available for this population, making genomic selection feasible. Combined with the multi-generation pedigree and phenotypic records, the following genomic analyses can be implemented:

Analysis	Method	Software
Single-step GBLUP	Combines A and G matrices via H-matrix	BLUPF90+ (PREGSF90)
GBLUP	G-matrix replaces A for genotyped animals	BLUPF90, rrBLUP, JWAS
Bayesian methods	BayesA, BayesB, BayesCπ, BayesR	JWAS (Julia), GenSel
GWAS	Single-SNP or window-based association	POSTGSF90, GEMMA
Cross-validation	Gen1 training, Gen2 validation	Custom R/Julia scripts

Implementation steps:

Quality control of genotype data (call rate, MAF, HWE, parent–offspring conflicts) → PLINK, PREGSF90
Build G-matrix and blend with pedigree A-matrix → PREGSF90
Estimate variance components with genomic information → AIREMLF90 (ssGBLUP)
Compute GEBVs for all traits → BLUPF90+
Cross-validate prediction accuracy → Custom scripts
Run GWAS to identify significant QTL regions → POSTGSF90
Construct optimized genomic selection index → R

11 Predicted Genetic Gain Scenarios

Genetic gain per generation: ΔG = i × r_TI × σ_A

# Use actual data for sigma_P
sigma_TE    <- desc_stats %>% filter(Trait == "TE") %>% pull(SD)
mean_TE     <- desc_stats %>% filter(Trait == "TE") %>% pull(Mean)
sigma_AFEGG <- desc_stats %>% filter(Trait == "AFEGG") %>% pull(SD)
mean_AFEGG  <- desc_stats %>% filter(Trait == "AFEGG") %>% pull(Mean)

# Scenario calculations
scenarios <- data.frame(
  Scenario = c(
    "1: Pedigree BLUP — TE (top 10% females)",
    "2: Pedigree BLUP — AFEGG (top 20%)",
    "3: Multi-trait index (TE + EW + AFEGG)",
    "4: Genomic selection — TE (ssGBLUP)"
  ),
  h2 = c(0.15, 0.30, NA, 0.15),
  i = c(1.76, 1.40, 1.40, 1.76),
  r_TI = c(0.45, 0.55, 0.50, 0.60),
  sigma_P = c(sigma_TE, sigma_AFEGG, sigma_TE, sigma_TE),
  stringsAsFactors = FALSE
) %>%
  mutate(
    sigma_A = ifelse(!is.na(h2), round(sqrt(h2) * sigma_P, 2), NA),
    Delta_G = ifelse(!is.na(sigma_A), round(i * r_TI * sigma_A, 1), NA)
  )

# Scenario 3: approximate multi-trait gain (reduced from single-trait by ~30%)
scenarios$Delta_G[3] <- round(scenarios$Delta_G[1] * 0.70, 1)
scenarios$sigma_A[3] <- NA

gain_display <- scenarios %>%
  select(Scenario, Delta_G) %>%
  mutate(
    Unit = c("eggs/gen", "days/gen", "eggs/gen (TE component)", "eggs/gen"),
    Delta_G = c(
      paste0("+", Delta_G[1]),
      paste0(Delta_G[2]),
      paste0("+", Delta_G[3]),
      paste0("+", Delta_G[4])
    )
  )

kable(gain_display, caption = "Predicted Genetic Gain per Generation",
      col.names = c("Scenario", "ΔG", "Unit")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Predicted Genetic Gain per Generation
Scenario	ΔG	Unit
1: Pedigree BLUP — TE (top 10% females)	+20.3	eggs/gen
2: Pedigree BLUP — AFEGG (top 20%)	4.3	days/gen
3: Multi-trait index (TE + EW + AFEGG)	+14.2	eggs/gen (TE component)
4: Genomic selection — TE (ssGBLUP)	+27.1	eggs/gen

if (nrow(gen_summary) == 2) {
  observed_TE_change <- g2$Mean_TE - g1$Mean_TE
  estimated_genetic  <- round(observed_TE_change / 2, 1)
  predicted_blup     <- scenarios$Delta_G[1]
  multiplier         <- round(as.numeric(predicted_blup) / estimated_genetic, 0)
  
  cat(paste0('\n\n<div class="success-box">\n',
             '**📈 Observed vs. Predicted:** The observed phenotypic trend is +',
             round(observed_TE_change, 1), ' eggs in TE (Gen1 → Gen2). ',
             'Assuming ~50% is genetic, the realized gain is ~', estimated_genetic,
             ' eggs/generation. Formal BLUP selection (predicted: +', predicted_blup,
             ' eggs) could multiply progress by approximately **', multiplier, 'x**.\n',
             '</div>\n\n'))
}

📈 Observed vs. Predicted: The observed phenotypic trend is +7.6 eggs in TE (Gen1 → Gen2). Assuming ~50% is genetic, the realized gain is ~3.8 eggs/generation. Formal BLUP selection (predicted: +20.3 eggs) could multiply progress by approximately 5x.

11.1 Realistic Timeline

Timeframe	Generations	Expected Cumulative Gain (TE)	Approach
Year 1–2	1	+12–18 eggs	Pedigree BLUP / ssGBLUP
Year 3–5	2–3	+25–45 eggs	ssGBLUP with expanded recording
Year 5–8	3–5	+50–80 eggs	Mature genomic selection program

12 Recommendations & Next Steps

12.1 Immediate (0–6 months)

12.1.1 Priority Actions

Run genetic evaluation (pedigree + genomic): Fit single-step GBLUP using BLUPF90 suite — PREGSF90 for genotype QC and G-matrix, AIREMLF90 for variance components, BLUPF90+ for GEBVs. Start with single-trait models for TE, AFEGG, EW, and BW.
Estimate genetic parameters: Use REML (AIREMLF90) to estimate heritabilities and genetic correlations. Validate with parent–offspring regression using the 4 sire and 20 dam links.
Estimate genetic trends: Compare mean EBVs/GEBVs between Gen1 and Gen2 to quantify realized genetic progress.
Record male body weights: Begin recording BW for all 20 males to improve sire EBV accuracy.
Investigate zero-production records: Check whether zero values in EP, LP, TE represent true non-producers, early culling, or data entry errors.

12.2 Medium-Term (6–18 months)

Develop a multi-trait selection index: Construct a balanced index with economic weights for TE, EW, BW, and AFEGG using Smith-Hazel methodology.
Expand recording: Add feed intake (FCR), egg quality traits, and livability data.
Increase population size: Target ≥ 1,000 birds per generation for robust parameter estimation and genomic prediction accuracy.

12.3 Long-Term (1–3 years)

Scale genomic selection: Genotype all selection candidates at hatch to maximize benefit of reduced generation interval.
Genetic dissemination: Develop a strategy for how improved genetics from AMO farm flow to multiplier and commercial flocks.

🐔 AMO Nigeria Poultry Breeding Program

Comprehensive Data Assessment & Breeding Strategy Report

Setegn Worku Alemu — International Livestock Research Institute (ILRI)

February 11, 2026