1 Introduction

1.1 Project Overview

🎯 Project Context: The Tropical Dairy Genetic Gains (TDGG) project aims to establish sustainable genetic improvement programs for smallholder dairy systems in East Africa, funded by the Bill & Melinda Gates Foundation.

1.1.1 Study Objectives

Characterize phenotypic distributions of key dairy traits
Quantify fixed effects (Region, Breed, Parity, DIM)
Prepare data for genetic evaluation using BLUPF90

1.2 Traits Analyzed

Economic traits under genetic evaluation
Trait	Unit	Heritability	Importance
Age at First Calving (AFC)	months	0.05 - 0.15	Reproductive efficiency
Daily Milk Yield (DMY)	kg/day	0.15 - 0.30	Daily productivity
Total Lactation Yield (TLY)	kg/lactation	0.20 - 0.35	Total production
Calving Interval (CI)	days	0.02 - 0.06	Reproductive interval

2 Data Loading

2.1 Load Data

# ==============================================================================
# SET YOUR DATA PATH HERE
# ==============================================================================
data_path <- "C:/Users/SAlemu/OneDrive - CGIAR/Documents/KenyaData/New_Kenya_data/Version2/"
output_path <- paste0(data_path, "BLUPF90CL/")

if(!dir.exists(output_path)) dir.create(output_path, recursive = TRUE)

cat("═══════════════════════════════════════════════════════════════════════════\n")

═══════════════════════════════════════════════════════════════════════════

cat("                     LOADING KENYA DAIRY DATA FILES                         \n")

                     LOADING KENYA DAIRY DATA FILES

cat("═══════════════════════════════════════════════════════════════════════════\n\n")

═══════════════════════════════════════════════════════════════════════════

# Pedigree data
pedigree <- fread(paste0(data_path, "pedigree-extract.csv"),
                  na.strings = c("NULL", "null", "NA", "N/A", "", " "),
                  stringsAsFactors = FALSE)

# Lactation data
lactation <- fread(paste0(data_path, "lactation_calving_improved(in).csv"),
                   na.strings = c("NULL", "null", "NA", "N/A", "", " "),
                   stringsAsFactors = FALSE)

# Milking data
milking <- fread(paste0(data_path, "milking_data_improved.csv"),
                 na.strings = c("NULL", "null", "NA", "N/A", "", " "),
                 stringsAsFactors = FALSE)

cat("✓ Pedigree:", format(nrow(pedigree), big.mark = ","), "records\n")

✓ Pedigree: 248,208 records

cat("✓ Lactation:", format(nrow(lactation), big.mark = ","), "records\n")

✓ Lactation: 88,352 records

cat("✓ Milking:", format(nrow(milking), big.mark = ","), "records\n\n")

✓ Milking: 1,048,575 records

# Show column names for debugging
cat("Pedigree columns:", paste(names(pedigree), collapse = ", "), "\n\n")

Pedigree columns: country, region, district, ward, village, farmer_name, farm_id, org_id, organization_name, project, animal_id, tag_id, original_tag_id, sire_tag_id, sire_id, dam_tag_id, dam_id, sex, estimated_sex, reg_date, birthdate, main_breed, breed, longitude, latitude, warning, error, Extract_Datetime_UTC, Version

cat("Lactation columns:", paste(names(lactation), collapse = ", "), "\n\n")

Lactation columns: farm_id, project, animal_id, tag_id, original_tag_id, animal_name, animal_registration_date, birthdate, sex, animal_sex, animal_type, animal_type_name, main_breed, main_breed_name, breed_composition, breed_proportion, secondary_breed_id, secondary_breed_name, secondary_breed_other, lactation_id, calving_date, parity, afc_months, afc_days, calving_interval_days, calving_interval_months, lactation_length_days, lactation_length_months, total_lactation_yield, testday_count, days_open, org_id, project_name, V34

cat("Milking columns:", paste(names(milking), collapse = ", "), "\n")

Milking columns: farm_id, project, animal_id, tag_id, original_tag_id, animal_name, animal_registration_date, birthdate, sex, animal_sex, main_breed, main_breed_name, breed_composition, breed_proportion, secondary_breed_id, secondary_breed_name, secondary_breed_other, lactation_id, calving_date, parity, afc_months, afc_days, calving_age_months, calving_age_days, milk_event_id, milking_date, testday_no, dim, milk_morning, milk_evening, milk_midday, daily_yield, data_completeness_score, org_id, project_name, record_created_date

2.2 Data Structure

head(pedigree, 5) %>%
  select(any_of(c("animal_id", "sire_id", "dam_id", "sex", "breed", "main_breed", 
                  "region", "district", "farm_id", "country"))) %>%
  styled_table(caption = "Sample pedigree records")

Sample pedigree records
animal_id	sire_id	dam_id	sex	breed	main_breed	region	district	farm_id	country
170285	NA	NA	FEMALE	Ayrshire	3	Migori	Kuria West	515160	Kenya
170298	0	NA	FEMALE	Ayrshire	3	Kakamega	Likuyani	515626	Kenya
170315	NA	NA	FEMALE	Ayrshire	3	Kakamega	Lugari	598316	Kenya
170328	NA	NA	FEMALE	Ayrshire	3	Kakamega	Likuyani	595412	Kenya
170506	NA	NA	FEMALE	Local Zebu	26	Homa Bay	Ndhiwa	514950	Kenya

3 Pedigree Analysis

3.1 Pedigree Statistics

n_animals <- nrow(pedigree)
n_sires <- n_distinct(pedigree$sire_id[!is.na(pedigree$sire_id)])
n_dams <- n_distinct(pedigree$dam_id[!is.na(pedigree$dam_id)])
n_founders <- sum(is.na(pedigree$sire_id) & is.na(pedigree$dam_id))
pct_known_sire <- mean(!is.na(pedigree$sire_id)) * 100
pct_known_dam <- mean(!is.na(pedigree$dam_id)) * 100

data.frame(
  Statistic = c("Total Animals", "Unique Sires", "Unique Dams", 
                "Founders", "Known Sire (%)", "Known Dam (%)"),
  Value = c(format(n_animals, big.mark = ","), format(n_sires, big.mark = ","),
            format(n_dams, big.mark = ","), format(n_founders, big.mark = ","),
            paste0(round(pct_known_sire, 1), "%"), paste0(round(pct_known_dam, 1), "%"))
) %>% styled_table(caption = "Pedigree completeness statistics")

Pedigree completeness statistics
Statistic	Value
Total Animals	248,208
Unique Sires	8,415
Unique Dams	44,062
Founders	151,283
Known Sire (%)	32.1%
Known Dam (%)	28.7%

3.2 Geographic Distribution

# Create summary data
region_summary <- pedigree %>%
  filter(!is.na(region)) %>%
  group_by(region) %>%
  summarise(
    Animals = n(), 
    Farms = n_distinct(farm_id), 
    Districts = n_distinct(district), 
    .groups = 'drop'
  ) %>%
  arrange(desc(Animals)) %>%
  mutate(
    Percentage = round(Animals / sum(Animals) * 100, 1),
    # Create label with region name and count
    label_animals = paste0(region, ": ", format(Animals, big.mark = ","), " (", Percentage, "%)"),
    label_farms = paste0(region, ": ", format(Farms, big.mark = ","))
  )

# Display as a nice table first
region_summary %>%
  select(Region = region, Animals, `%` = Percentage, Farms, Districts) %>%
  mutate(Animals = format(Animals, big.mark = ","),
         Farms = format(Farms, big.mark = ",")) %>%
  styled_table(caption = "Geographic distribution summary")

Geographic distribution summary
Region	Animals	%	Farms	Districts
Nakuru	56,756	27.9	1,680	12
Makueni	15,293	7.5	3,265	8
Trans Nzoia	14,302	7.0	321	5
Uasin Gishu	13,297	6.5	1,462	7
Kiambu	11,948	5.9	948	11
Nairobi	11,849	5.8	489	10
Nyeri	10,100	5.0	1,475	6
Laikipia	9,590	4.7	329	3
Muranga	6,749	3.3	1,708	8
Nyandarua	5,973	2.9	1,100	8
Meru	5,710	2.8	1,131	6
Kisumu	4,290	2.1	1,382	8
Narok	4,251	2.1	117	6
Kakamega	4,028	2.0	1,277	12
Nandi County	2,484	1.2	549	5
Siaya	2,442	1.2	1,068	7
Kajiado	2,129	1.0	138	5
Kilifi	1,738	0.9	8	3
Bungoma	1,736	0.9	512	9
Embu	1,603	0.8	209	3
Taita Taveta	1,587	0.8	181	6
Homa Bay	1,584	0.8	838	9
Busia	1,553	0.8	720	8
Migori	1,504	0.7	825	10
Kisii	1,240	0.6	273	10
Baringo	1,237	0.6	101	6
Bomet	1,002	0.5	193	6
Tharaka Nithi	981	0.5	200	3
Vihiga	969	0.5	431	6
Kitui	862	0.4	516	9
Elgeyo Marakwet	823	0.4	85	4
Machakos	800	0.4	108	6
Kericho	788	0.4	95	5
Kirinyaga	739	0.4	84	4
Nyamira	667	0.3	196	5
Mombasa	400	0.2	7	1
West Pokot	118	0.1	9	2
Marsabit	58	0.0	14	2
Kwale	45	0.0	2	2
Oromia	18	0.0	1	1
Mandera	8	0.0	1	1
Tana River	6	0.0	3	1
Amhara	5	0.0	3	2
Kolero Ward	3	0.0	1	1
Osun	1	0.0	1	1

# Side-by-side horizontal bar plots with values displayed inside bars
p1 <- ggplot(region_summary, aes(x = reorder(region, Animals), y = Animals)) +

  geom_col(aes(fill = Animals), width = 0.7, show.legend = FALSE) +
  geom_text(aes(label = paste0(format(Animals, big.mark = ","), " (", Percentage, "%)")), 
            hjust = 1.1, size = 4, fontface = "bold", color = "white") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05)), labels = comma) +
  scale_fill_gradient(low = "#3B82F6", high = "#1e3a8a") +
  coord_flip() +
  labs(
    title = "Animals by Region",
    x = NULL, 
    y = "Number of Animals"
  ) +
  theme(
    axis.text.y = element_text(size = 11, face = "bold"),
    plot.title = element_text(size = 14)
  )

p2 <- ggplot(region_summary, aes(x = reorder(region, Farms), y = Farms)) +
  geom_col(aes(fill = Farms), width = 0.7, show.legend = FALSE) +
  geom_text(aes(label = format(Farms, big.mark = ",")), 
            hjust = 1.1, size = 4, fontface = "bold", color = "white") +
  scale_fill_gradient(low = "#10B981", high = "#065f46") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  coord_flip() +
  labs(
    title = "Farms by Region",
    x = NULL, 
    y = "Number of Farms"
  ) +
  theme(
    axis.text.y = element_text(size = 11, face = "bold"),
    plot.title = element_text(size = 14)
  )

p1 + p2 + plot_annotation(
  title = "Geographic Coverage of Kenya Dairy Genetic Evaluation",
  theme = theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
)

4 Interactive Maps

4.1 Regional Map

🗺️ Interactive Visualization: Explore the geographic distribution of dairy cattle and farms across Kenya’s counties. Click on markers for detailed statistics.

# ==============================================================================
# KENYA COUNTY COORDINATES
# ==============================================================================
# Approximate centroids for Kenya's major dairy regions
# You may need to adjust these based on your actual region names in the data

kenya_coords <- data.frame(
  region = c(
    # Major dairy regions (Rift Valley & Central)
    "Nakuru", "Nyandarua", "Kiambu", "Nyeri", "Murang'a", "Kirinyaga",
    "Laikipia", "Narok", "Baringo", "Uasin Gishu", "Trans Nzoia", "Elgeyo Marakwet",
    "Nandi", "Kericho", "Bomet", "West Pokot",
    # Western Kenya
    "Kakamega", "Bungoma", "Vihiga", "Busia",
    # Nyanza
    "Kisii", "Nyamira", "Kisumu", "Siaya", "Homa Bay", "Migori",
    # Eastern
    "Meru", "Embu", "Tharaka-Nithi", "Machakos", "Makueni", "Kitui",
    # Coast
    "Kilifi", "Kwale", "Mombasa", "Taita Taveta",
    # Nairobi & surroundings
    "Nairobi", "Kajiado"
  ),
  lat = c(
    -0.2833, -0.4000, -1.0000, -0.4167, -0.7833, -0.5000,
    0.2000, -1.1000, 0.4667, 0.5167, 1.0000, 0.9500,
    0.1833, -0.3667, -0.7833, 1.6167,
    0.2833, 0.5667, 0.0500, 0.4500,
    -0.6833, -0.5667, -0.1000, 0.0617, -0.5167, -1.0667,
    0.0500, -0.5333, -0.3000, -1.5167, -1.8000, -1.3667,
    -3.5000, -4.1833, -4.0500, -3.4000,
    -1.2833, -1.8500
  ),
  lng = c(
    36.0667, 36.4333, 36.8333, 37.0000, 37.0000, 37.3333,
    36.7833, 35.8667, 35.9667, 35.2833, 35.0000, 35.5000,
    35.1833, 35.2833, 35.3333, 35.1000,
    34.7500, 34.5667, 34.7333, 34.1333,
    34.7667, 34.9333, 34.7500, 34.2500, 34.4500, 34.4667,
    37.6500, 37.4500, 37.8500, 37.2667, 37.6167, 38.0000,
    39.8500, 39.4500, 39.6667, 38.3500,
    36.8167, 36.7833
  ),
  stringsAsFactors = FALSE
)

# Merge with region summary data
map_data <- region_summary %>%
  left_join(kenya_coords, by = "region") %>%
  filter(!is.na(lat), !is.na(lng))

# Check for unmatched regions
unmatched <- region_summary %>%
  anti_join(kenya_coords, by = "region")

if(nrow(unmatched) > 0) {
  cat("Note: The following regions could not be mapped (coordinates not found):\n")
  cat(paste(unmatched$region, collapse = ", "), "\n")
  cat("You may need to add their coordinates to the kenya_coords dataframe.\n\n")
}

Note: The following regions could not be mapped (coordinates not found):
Muranga, Nandi County, Tharaka Nithi, Marsabit, Oromia, Mandera, Tana River, Amhara, Kolero Ward, Osun 
You may need to add their coordinates to the kenya_coords dataframe.

# ==============================================================================
# INTERACTIVE BUBBLE MAP - Animals by Region
# ==============================================================================

if(nrow(map_data) > 0) {
  
  # Create color palette based on number of animals
  pal <- colorNumeric(
    palette = c("#3B82F6", "#1e40af", "#1e3a8a"),
    domain = map_data$Animals
  )
  
  # Scale bubble size (sqrt transformation for better visual scaling)
  map_data <- map_data %>%
    mutate(
      radius = sqrt(Animals) / max(sqrt(Animals)) * 40 + 5,
      popup_content = paste0(
        "<div style='font-family: sans-serif; padding: 10px;'>",
        "<h4 style='color: #1a1f36; margin: 0 0 10px 0; border-bottom: 2px solid #D4AF37; padding-bottom: 5px;'>",
        "<strong>", region, "</strong></h4>",
        "<table style='width: 100%; font-size: 13px;'>",
        "<tr><td style='padding: 3px 10px 3px 0;'>🐄 Animals:</td><td style='font-weight: bold; color: #3B82F6;'>", 
        format(Animals, big.mark = ","), "</td></tr>",
        "<tr><td style='padding: 3px 10px 3px 0;'>🏠 Farms:</td><td style='font-weight: bold; color: #10B981;'>", 
        format(Farms, big.mark = ","), "</td></tr>",
        "<tr><td style='padding: 3px 10px 3px 0;'>📍 Districts:</td><td style='font-weight: bold; color: #7C3AED;'>", 
        Districts, "</td></tr>",
        "<tr><td style='padding: 3px 10px 3px 0;'>📊 Share:</td><td style='font-weight: bold; color: #F97316;'>", 
        Percentage, "%</td></tr>",
        "</table></div>"
      )
    )
  
  # Create the map
  leaflet(map_data, width = "100%", height = 600) %>%
    addProviderTiles("CartoDB.Positron", group = "Light") %>%
    addProviderTiles("Esri.WorldImagery", group = "Satellite") %>%
    addProviderTiles("OpenStreetMap", group = "OpenStreetMap") %>%
    addProviderTiles("Esri.WorldTopoMap", group = "Topographic") %>%
    
    # Add circle markers
    addCircleMarkers(
      lng = ~lng, lat = ~lat,
      radius = ~radius,
      color = "#1e3a8a",
      weight = 2,
      opacity = 0.9,
      fillColor = ~pal(Animals),
      fillOpacity = 0.7,
      popup = ~popup_content,
      label = ~paste0(region, ": ", format(Animals, big.mark = ","), " animals"),
      labelOptions = labelOptions(
        style = list("font-weight" = "bold", "font-size" = "12px"),
        textsize = "12px",
        direction = "auto"
      ),
      group = "Animals"
    ) %>%
    
    # Add legend
    addLegend(
      position = "bottomright",
      pal = pal,
      values = ~Animals,
      title = "Number of<br>Animals",
      opacity = 0.9,
      labFormat = labelFormat(big.mark = ",")
    ) %>%
    
    # Add layer controls
    addLayersControl(
      baseGroups = c("Light", "Satellite", "OpenStreetMap", "Topographic"),
      options = layersControlOptions(collapsed = TRUE)
    ) %>%
    
    # Add scale bar
    addScaleBar(position = "bottomleft") %>%
    
    # Set default view to Kenya
    setView(lng = 37.0, lat = -0.5, zoom = 6) %>%
    
    # Add title
    addControl(
      html = "<div style='background: white; padding: 10px 15px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); font-family: sans-serif;'>
              <strong style='font-size: 16px; color: #1a1f36;'>🐄 Kenya Dairy Cattle Distribution</strong><br>
              <span style='font-size: 12px; color: #6B7280;'>Click markers for detailed statistics</span></div>",
      position = "topright"
    )
  
} else {
  cat("No regions could be mapped. Please check that region names in your data match the coordinate lookup table.\n")
}

4.2 Farm Density Heatmap

# ==============================================================================
# HEATMAP VISUALIZATION - Farm Density
# ==============================================================================

if(nrow(map_data) > 0) {
  
  # Create expanded points for heatmap effect (simulate density)
  # Weight by number of farms
  heatmap_points <- map_data %>%
    slice(rep(1:n(), times = ceiling(Farms/100))) %>%
    mutate(
      # Add small random jitter for visual effect
      lat_jitter = lat + runif(n(), -0.15, 0.15),
      lng_jitter = lng + runif(n(), -0.15, 0.15)
    )
  
  leaflet(width = "100%", height = 600) %>%
    addProviderTiles("CartoDB.DarkMatter", group = "Dark") %>%
    addProviderTiles("CartoDB.Positron", group = "Light") %>%
    
    # Add heatmap layer
    addHeatmap(
      data = heatmap_points,
      lng = ~lng_jitter, lat = ~lat_jitter,
      intensity = ~Farms/max(Farms),
      blur = 25,
      max = 0.8,
      radius = 30,
      gradient = c(
        "0" = "transparent",
        "0.2" = "#3B82F6",
        "0.4" = "#10B981", 
        "0.6" = "#F97316",
        "0.8" = "#EF4444",
        "1" = "#DC2626"
      ),
      group = "Heatmap"
    ) %>%
    
    # Add region labels
    addLabelOnlyMarkers(
      data = map_data,
      lng = ~lng, lat = ~lat,
      label = ~region,
      labelOptions = labelOptions(
        noHide = TRUE,
        direction = "center",
        textOnly = TRUE,
        style = list(
          "color" = "white",
          "font-weight" = "bold",
          "font-size" = "11px",
          "text-shadow" = "1px 1px 2px black"
        )
      ),
      group = "Labels"
    ) %>%
    
    addLayersControl(
      baseGroups = c("Dark", "Light"),
      overlayGroups = c("Heatmap", "Labels"),
      options = layersControlOptions(collapsed = TRUE)
    ) %>%
    
    addScaleBar(position = "bottomleft") %>%
    
    setView(lng = 37.0, lat = -0.5, zoom = 6) %>%
    
    addControl(
      html = "<div style='background: rgba(26,31,54,0.9); padding: 10px 15px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.3); font-family: sans-serif;'>
              <strong style='font-size: 16px; color: white;'>🔥 Farm Density Heatmap</strong><br>
              <span style='font-size: 12px; color: #9CA3AF;'>Intensity reflects concentration of dairy farms</span></div>",
      position = "topright"
    )
}

4.3 Clustered Markers

# ==============================================================================
# CLUSTERED MARKERS - Interactive Farm Exploration
# ==============================================================================

if(nrow(map_data) > 0) {
  
  # Assign colors based on animal count
  map_data <- map_data %>%
    mutate(
      marker_color = case_when(
        Animals > 10000 ~ "#DC2626",
        Animals > 5000 ~ "#F97316",
        Animals > 1000 ~ "#0891B2",
        TRUE ~ "#3B82F6"
      )
    )
  
  leaflet(map_data, width = "100%", height = 600) %>%
    addProviderTiles("CartoDB.Voyager", group = "Voyager") %>%
    addProviderTiles("Esri.WorldStreetMap", group = "Streets") %>%
    addProviderTiles("Esri.WorldTopoMap", group = "Terrain") %>%
    
    addCircleMarkers(
      lng = ~lng, lat = ~lat,
      radius = 12,
      color = ~marker_color,
      weight = 2,
      opacity = 1,
      fillColor = ~marker_color,
      fillOpacity = 0.7,
      popup = ~popup_content,
      label = ~paste0(region, " (", format(Animals, big.mark = ","), ")"),
      clusterOptions = markerClusterOptions(
        iconCreateFunction = JS("
          function(cluster) {
            var childCount = cluster.getChildCount();
            var c = ' marker-cluster-';
            if (childCount < 5) {
              c += 'small';
            } else if (childCount < 10) {
              c += 'medium';
            } else {
              c += 'large';
            }
            return new L.DivIcon({ 
              html: '<div><span>' + childCount + '</span></div>', 
              className: 'marker-cluster' + c, 
              iconSize: new L.Point(40, 40) 
            });
          }
        ")
      )
    ) %>%
    
    addLayersControl(
      baseGroups = c("Voyager", "Streets", "Terrain"),
      options = layersControlOptions(collapsed = TRUE)
    ) %>%
    
    addScaleBar(position = "bottomleft") %>%
    
    setView(lng = 37.0, lat = -0.5, zoom = 6) %>%
    
    addControl(
      html = "<div style='background: white; padding: 10px 15px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); font-family: sans-serif;'>
              <strong style='font-size: 16px; color: #1a1f36;'>📍 Regional Clusters</strong><br>
              <span style='font-size: 12px; color: #6B7280;'>Click clusters to zoom, markers for details</span><br>
              <div style='margin-top: 8px; font-size: 11px;'>
              <span style='color: #DC2626;'>●</span> >10,000 &nbsp;
              <span style='color: #F97316;'>●</span> 5,000-10,000 &nbsp;
              <span style='color: #0891B2;'>●</span> 1,000-5,000 &nbsp;
              <span style='color: #3B82F6;'>●</span> <1,000
              </div></div>",
      position = "topright"
    )
}

4.4 Trait Performance Map

# ==============================================================================
# PREPARE TRAIT DATA BY REGION FOR MAPPING
# (This section will be populated after phenotype data is processed)
# ==============================================================================

# Note: This chunk prepares the trait summary data for mapping
# It will be displayed after the phenotypic data preparation section below

5 Breed Composition

# Determine which breed column exists
breed_col <- if("breed" %in% names(pedigree)) "breed" else if("main_breed" %in% names(pedigree)) "main_breed" else NULL

if(!is.null(breed_col)) {
  breed_summary <- pedigree %>%
    filter(!is.na(.data[[breed_col]])) %>%
    count(breed_val = .data[[breed_col]], name = "Count") %>%
    arrange(desc(Count)) %>%
    head(12) %>%
    mutate(Percentage = round(Count / sum(Count) * 100, 1))
  
  # Show table first
  breed_summary %>%
    rename(Breed = breed_val, Animals = Count, `%` = Percentage) %>%
    mutate(Animals = format(Animals, big.mark = ",")) %>%
    styled_table(caption = "Breed composition of study population")
}

Breed composition of study population
Breed	Animals	%
Holstein Friesian	109,158	51.2
Ayrshire	32,484	15.2
Sahiwal	20,880	9.8
Boran	18,976	8.9
Jersey	8,631	4.1
Guernsey	6,293	3.0
Holstein-Friesian crosses	3,726	1.7
Unknown	3,484	1.6
Others	3,341	1.6
Simmental	2,824	1.3
Local Zebu	2,169	1.0
Sahiwal crosses	1,143	0.5

if(!is.null(breed_col)) {
  breed_summary <- breed_summary %>%
    mutate(breed_val = factor(breed_val, levels = rev(breed_val)))
  
  ggplot(breed_summary, aes(x = breed_val, y = Count)) +
    geom_col(aes(fill = Count), width = 0.7, show.legend = FALSE) +
    geom_text(aes(label = paste0(format(Count, big.mark = ","), " (", Percentage, "%)")),
              hjust = 1.1, size = 3.8, fontface = "bold", color = "white") +
    scale_fill_gradient(low = "#F97316", high = "#9a3412") +
    scale_y_continuous(expand = expansion(mult = c(0, 0.05)), labels = comma) +
    coord_flip() +
    labs(
      title = "Breed Composition of Study Population",
      subtitle = paste("Total:", format(sum(breed_summary$Count), big.mark = ","), "animals"),
      x = NULL, 
      y = "Number of Animals"
    ) +
    theme(
      axis.text.y = element_text(size = 10, face = "bold"),
      plot.title = element_text(size = 14)
    )
}

6 Phenotypic Data Preparation

6.1 Data Filtering

# ==============================================================================
# PREPARE LOOKUP TABLE FROM PEDIGREE (region only - farm_id is in phenotype files)
# ==============================================================================

# Determine breed column in pedigree
breed_col_ped <- if("breed" %in% names(pedigree)) "breed" else "main_breed"

pedigree_lookup <- pedigree %>%
  mutate(animal_id = as.character(animal_id)) %>%
  select(animal_id, region, district, any_of(breed_col_ped)) %>%
  rename(breed_ped = any_of(breed_col_ped)) %>%
  group_by(animal_id) %>%
  slice(1) %>%
  ungroup()

cat("Pedigree lookup created with", nrow(pedigree_lookup), "animals\n\n")

Pedigree lookup created with 248208 animals

# ==============================================================================
# AGE AT FIRST CALVING (AFC) DATA
# ==============================================================================
cat("Processing AFC data...\n")

Processing AFC data...

afc_data <- lactation %>%
  mutate(
    animal_id = as.character(animal_id),
    afc_months = as.numeric(afc_months),
    parity = as.numeric(parity)
  ) %>%
  filter(
    parity == 1,
    !is.na(afc_months),
    afc_months >= 18,
    afc_months <= 72
  ) %>%
  left_join(pedigree_lookup, by = "animal_id") %>%
  filter(!is.na(region)) %>%
  mutate(
    # Use main_breed_name from lactation if available, otherwise from pedigree
    breed = coalesce(main_breed_name, breed_ped)
  ) %>%
  filter(!is.na(breed), !is.na(farm_id)) %>%
  group_by(animal_id) %>%
  slice(1) %>%
  ungroup() %>%
  mutate(
    region = factor(region),
    farm_id = factor(farm_id),
    breed = factor(breed)
  )

cat("AFC records after filtering:", format(nrow(afc_data), big.mark = ","), "\n")

AFC records after filtering: 27,503

# ==============================================================================
# DAILY MILK YIELD (TEST-DAY) DATA
# ==============================================================================
cat("Processing Daily Milk data...\n")

Processing Daily Milk data...

daily_milk_data <- milking %>%
  mutate(
    animal_id = as.character(animal_id),
    daily_yield = as.numeric(daily_yield),
    dim = as.numeric(dim),
    parity = as.numeric(parity)
  ) %>%
  filter(
    !is.na(daily_yield),
    daily_yield > 0.5,
    daily_yield < 60,
    !is.na(dim),
    dim >= 5,
    dim <= 500
  ) %>%
  left_join(pedigree_lookup, by = "animal_id") %>%
  filter(!is.na(region)) %>%
  mutate(
    breed = coalesce(main_breed_name, breed_ped)
  ) %>%
  filter(!is.na(breed), !is.na(farm_id)) %>%
  mutate(
    parity_class = factor(ifelse(is.na(parity) | parity < 1, 1, ifelse(parity > 4, 4, parity))),
    dim_class = factor(ifelse(ceiling(dim / 30) > 17, 17, ceiling(dim / 30))),
    region = factor(region),
    farm_id = factor(farm_id),
    breed = factor(breed)
  )

cat("Daily Milk records after filtering:", format(nrow(daily_milk_data), big.mark = ","), "\n")

Daily Milk records after filtering: 582,060

# ==============================================================================
# TOTAL LACTATION YIELD (TLY) DATA
# ==============================================================================
cat("Processing TLY data...\n")

Processing TLY data...

tly_data <- lactation %>%
  mutate(
    animal_id = as.character(animal_id),
    total_lactation_yield = as.numeric(total_lactation_yield),
    parity = as.numeric(parity)
  ) %>%
  filter(
    !is.na(total_lactation_yield),
    total_lactation_yield >= 100,
    total_lactation_yield <= 20000
  ) %>%
  left_join(pedigree_lookup, by = "animal_id") %>%
  filter(!is.na(region)) %>%
  mutate(
    breed = coalesce(main_breed_name, breed_ped)
  ) %>%
  filter(!is.na(breed), !is.na(farm_id)) %>%
  mutate(
    parity_class = factor(ifelse(is.na(parity) | parity < 1, 1, ifelse(parity > 4, 4, parity))),
    region = factor(region),
    farm_id = factor(farm_id),
    breed = factor(breed)
  )

cat("TLY records after filtering:", format(nrow(tly_data), big.mark = ","), "\n")

TLY records after filtering: 9,558

# ==============================================================================
# CALVING INTERVAL (CI) DATA
# ==============================================================================
cat("Processing CI data...\n")

Processing CI data...

ci_data <- lactation %>%
  mutate(
    animal_id = as.character(animal_id),
    calving_interval_days = as.numeric(calving_interval_days),
    parity = as.numeric(parity)
  ) %>%
  filter(
    parity > 1,
    !is.na(calving_interval_days),
    calving_interval_days >= 280,
    calving_interval_days <= 900
  ) %>%
  left_join(pedigree_lookup, by = "animal_id") %>%
  filter(!is.na(region)) %>%
  mutate(
    breed = coalesce(main_breed_name, breed_ped)
  ) %>%
  filter(!is.na(breed), !is.na(farm_id)) %>%
  mutate(
    parity_class = factor(ifelse(parity > 4, 4, parity)),
    region = factor(region),
    farm_id = factor(farm_id),
    breed = factor(breed)
  )

cat("CI records after filtering:", format(nrow(ci_data), big.mark = ","), "\n\n")

CI records after filtering: 44,049

# ==============================================================================
# SUMMARY TABLE
# ==============================================================================
data.frame(
  Trait = c("Age at First Calving", "Daily Milk Yield", "Total Lactation Yield", "Calving Interval"),
  Abbrev = c("AFC", "DMY", "TLY", "CI"),
  Records = c(nrow(afc_data), nrow(daily_milk_data), nrow(tly_data), nrow(ci_data)),
  Animals = c(n_distinct(afc_data$animal_id), n_distinct(daily_milk_data$animal_id),
              n_distinct(tly_data$animal_id), n_distinct(ci_data$animal_id)),
  Farms = c(n_distinct(afc_data$farm_id), n_distinct(daily_milk_data$farm_id),
            n_distinct(tly_data$farm_id), n_distinct(ci_data$farm_id)),
  Filter = c("Parity=1, 18-72 mo", "0.5-60 kg, DIM 5-500", "100-20,000 kg", "Parity≥2, 280-900 d")
) %>%
  mutate(Records = format(Records, big.mark = ","),
         Animals = format(Animals, big.mark = ","),
         Farms = format(Farms, big.mark = ",")) %>%
  styled_table(caption = "Data available after quality control filtering")

Data available after quality control filtering
Trait	Abbrev	Records	Animals	Farms	Filter
Age at First Calving	AFC	27,503	27,503	1,239	Parity=1, 18-72 mo
Daily Milk Yield	DMY	582,060	15,255	647	0.5-60 kg, DIM 5-500
Total Lactation Yield	TLY	9,558	4,931	354	100-20,000 kg
Calving Interval	CI	44,049	17,527	440	Parity≥2, 280-900 d

7 Interactive Trait Performance Map

7.1 Daily Milk by Region

# ==============================================================================
# TRAIT PERFORMANCE MAP - Daily Milk Yield by Region
# ==============================================================================

# Calculate regional trait means
trait_by_region <- daily_milk_data %>%
  group_by(region) %>%
  summarise(
    n_records = n(),
    n_animals = n_distinct(animal_id),
    mean_milk = round(mean(daily_yield, na.rm = TRUE), 2),
    sd_milk = round(sd(daily_yield, na.rm = TRUE), 2),
    .groups = 'drop'
  ) %>%
  mutate(region = as.character(region))

# Merge with coordinates
trait_map_data <- trait_by_region %>%
  left_join(kenya_coords, by = "region") %>%
  filter(!is.na(lat), !is.na(lng))

if(nrow(trait_map_data) > 0) {
  
  # Color palette for milk yield
  milk_pal <- colorNumeric(
    palette = c("#FEE2E2", "#FECACA", "#FCA5A5", "#F87171", "#EF4444", "#DC2626", "#B91C1C"),
    domain = trait_map_data$mean_milk
  )
  
  # Create popup content
  trait_map_data <- trait_map_data %>%
    mutate(
      popup_trait = paste0(
        "<div style='font-family: sans-serif; padding: 10px; min-width: 200px;'>",
        "<h4 style='color: #1a1f36; margin: 0 0 10px 0; border-bottom: 2px solid #3B82F6; padding-bottom: 5px;'>",
        "<strong>", region, "</strong></h4>",
        "<div style='background: #EFF6FF; padding: 10px; border-radius: 8px; margin-bottom: 10px;'>",
        "<span style='font-size: 24px; font-weight: bold; color: #1e40af;'>", mean_milk, "</span>",
        "<span style='font-size: 14px; color: #3B82F6;'> kg/day</span>",
        "</div>",
        "<table style='width: 100%; font-size: 12px;'>",
        "<tr><td>± SD:</td><td style='font-weight: bold;'>", sd_milk, " kg</td></tr>",
        "<tr><td>Records:</td><td style='font-weight: bold;'>", format(n_records, big.mark = ","), "</td></tr>",
        "<tr><td>Animals:</td><td style='font-weight: bold;'>", format(n_animals, big.mark = ","), "</td></tr>",
        "</table></div>"
      )
    )
  
  leaflet(trait_map_data, width = "100%", height = 600) %>%
    addProviderTiles("CartoDB.Positron", group = "Light") %>%
    addProviderTiles("Esri.WorldGrayCanvas", group = "Gray") %>%
    
    # Choropleth-style circles
    addCircleMarkers(
      lng = ~lng, lat = ~lat,
      radius = ~sqrt(n_animals) / max(sqrt(n_animals)) * 35 + 8,
      color = "#1e40af",
      weight = 2,
      opacity = 0.8,
      fillColor = ~milk_pal(mean_milk),
      fillOpacity = 0.8,
      popup = ~popup_trait,
      label = ~paste0(region, ": ", mean_milk, " kg/day")
    ) %>%
    
    # Add labels with values
    addLabelOnlyMarkers(
      lng = ~lng, lat = ~lat,
      label = ~paste0(round(mean_milk, 1)),
      labelOptions = labelOptions(
        noHide = TRUE,
        direction = "center",
        textOnly = TRUE,
        style = list(
          "color" = "#1e3a8a",
          "font-weight" = "bold",
          "font-size" = "11px"
        )
      )
    ) %>%
    
    addLegend(
      position = "bottomright",
      pal = milk_pal,
      values = ~mean_milk,
      title = "Daily Milk<br>(kg/day)",
      opacity = 0.9
    ) %>%
    
    addLayersControl(
      baseGroups = c("Light", "Gray"),
      options = layersControlOptions(collapsed = TRUE)
    ) %>%
    
    addScaleBar(position = "bottomleft") %>%
    
    setView(lng = 37.0, lat = -0.5, zoom = 6) %>%
    
    addControl(
      html = "<div style='background: white; padding: 10px 15px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); font-family: sans-serif;'>
              <strong style='font-size: 16px; color: #1a1f36;'>🥛 Daily Milk Yield by Region</strong><br>
              <span style='font-size: 12px; color: #6B7280;'>Circle size = # animals; Color = yield</span></div>",
      position = "topright"
    )
}

7.2 Multi-Trait Comparison

# ==============================================================================
# MULTI-TRAIT COMPARISON MAP
# ==============================================================================

# Calculate all trait means by region
all_traits_by_region <- bind_rows(
  afc_data %>% 
    group_by(region) %>% 
    summarise(mean_val = mean(afc_months), n = n(), .groups = 'drop') %>%
    mutate(trait = "AFC", unit = "months"),
  
  daily_milk_data %>% 
    group_by(region) %>% 
    summarise(mean_val = mean(daily_yield), n = n(), .groups = 'drop') %>%
    mutate(trait = "Daily Milk", unit = "kg/day"),
  
  tly_data %>% 
    group_by(region) %>% 
    summarise(mean_val = mean(total_lactation_yield), n = n(), .groups = 'drop') %>%
    mutate(trait = "Lactation Yield", unit = "kg"),
  
  ci_data %>% 
    group_by(region) %>% 
    summarise(mean_val = mean(calving_interval_days), n = n(), .groups = 'drop') %>%
    mutate(trait = "Calving Interval", unit = "days")
) %>%
  mutate(region = as.character(region))

# Pivot to wide format for popup
traits_wide <- all_traits_by_region %>%
  select(region, trait, mean_val) %>%
  pivot_wider(names_from = trait, values_from = mean_val) %>%
  left_join(kenya_coords, by = "region") %>%
  filter(!is.na(lat), !is.na(lng))

if(nrow(traits_wide) > 0) {
  
  # Create comprehensive popup
  traits_wide <- traits_wide %>%
    mutate(
      popup_all = paste0(
        "<div style='font-family: sans-serif; padding: 12px; min-width: 220px;'>",
        "<h4 style='color: #1a1f36; margin: 0 0 12px 0; border-bottom: 3px solid #D4AF37; padding-bottom: 8px;'>",
        region, "</h4>",
        
        "<div style='display: grid; grid-template-columns: 1fr 1fr; gap: 8px;'>",
        
        # AFC
        "<div style='background: linear-gradient(135deg, #FFF7ED, #FFEDD5); padding: 10px; border-radius: 8px; border-left: 3px solid #F97316;'>",
        "<div style='font-size: 11px; color: #9A3412;'>AFC</div>",
        "<div style='font-size: 18px; font-weight: bold; color: #C2410C;'>", 
        ifelse(is.na(AFC), "N/A", paste0(round(AFC, 1), " mo")), "</div></div>",
        
        # Daily Milk
        "<div style='background: linear-gradient(135deg, #EFF6FF, #DBEAFE); padding: 10px; border-radius: 8px; border-left: 3px solid #3B82F6;'>",
        "<div style='font-size: 11px; color: #1E40AF;'>Daily Milk</div>",
        "<div style='font-size: 18px; font-weight: bold; color: #1D4ED8;'>", 
        ifelse(is.na(`Daily Milk`), "N/A", paste0(round(`Daily Milk`, 1), " kg")), "</div></div>",
        
        # Lactation Yield
        "<div style='background: linear-gradient(135deg, #ECFDF5, #D1FAE5); padding: 10px; border-radius: 8px; border-left: 3px solid #10B981;'>",
        "<div style='font-size: 11px; color: #065F46;'>Lactation</div>",
        "<div style='font-size: 18px; font-weight: bold; color: #047857;'>", 
        ifelse(is.na(`Lactation Yield`), "N/A", paste0(format(round(`Lactation Yield`, 0), big.mark = ","), " kg")), "</div></div>",
        
        # Calving Interval
        "<div style='background: linear-gradient(135deg, #F5F3FF, #EDE9FE); padding: 10px; border-radius: 8px; border-left: 3px solid #7C3AED;'>",
        "<div style='font-size: 11px; color: #5B21B6;'>CI</div>",
        "<div style='font-size: 18px; font-weight: bold; color: #6D28D9;'>", 
        ifelse(is.na(`Calving Interval`), "N/A", paste0(round(`Calving Interval`, 0), " d")), "</div></div>",
        
        "</div></div>"
      )
    )
  
  leaflet(traits_wide, width = "100%", height = 600) %>%
    addProviderTiles("CartoDB.Voyager", group = "Voyager") %>%
    addProviderTiles("Esri.WorldGrayCanvas", group = "Gray") %>%
    
    addCircleMarkers(
      lng = ~lng, lat = ~lat,
      radius = 18,
      color = "#D4AF37",
      weight = 3,
      opacity = 1,
      fillColor = "#1a1f36",
      fillOpacity = 0.85,
      popup = ~popup_all,
      label = ~region
    ) %>%
    
    addLabelOnlyMarkers(
      lng = ~lng, lat = ~lat,
      label = ~region,
      labelOptions = labelOptions(
        noHide = TRUE,
        direction = "bottom",
        textOnly = TRUE,
        offset = c(0, 15),
        style = list(
          "color" = "#1a1f36",
          "font-weight" = "bold",
          "font-size" = "10px"
        )
      )
    ) %>%
    
    addLayersControl(
      baseGroups = c("Voyager", "Gray"),
      options = layersControlOptions(collapsed = TRUE)
    ) %>%
    
    addScaleBar(position = "bottomleft") %>%
    
    setView(lng = 37.0, lat = -0.5, zoom = 6) %>%
    
    addControl(
      html = "<div style='background: white; padding: 12px 15px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); font-family: sans-serif;'>
              <strong style='font-size: 16px; color: #1a1f36;'>📊 Multi-Trait Regional Performance</strong><br>
              <span style='font-size: 12px; color: #6B7280;'>Click markers to compare all traits</span></div>",
      position = "topright"
    )
}

8 Descriptive Statistics

8.1 Summary Statistics

data.frame(
  Trait = c("AFC (months)", "Daily Milk (kg)", "Lactation Milk (kg)", "Calving Interval (days)"),
  N = c(nrow(afc_data), nrow(daily_milk_data), nrow(tly_data), nrow(ci_data)),
  Mean = c(mean(afc_data$afc_months), mean(daily_milk_data$daily_yield),
           mean(tly_data$total_lactation_yield), mean(ci_data$calving_interval_days)),
  SD = c(sd(afc_data$afc_months), sd(daily_milk_data$daily_yield),
         sd(tly_data$total_lactation_yield), sd(ci_data$calving_interval_days)),
  Min = c(min(afc_data$afc_months), min(daily_milk_data$daily_yield),
          min(tly_data$total_lactation_yield), min(ci_data$calving_interval_days)),
  Median = c(median(afc_data$afc_months), median(daily_milk_data$daily_yield),
             median(tly_data$total_lactation_yield), median(ci_data$calving_interval_days)),
  Max = c(max(afc_data$afc_months), max(daily_milk_data$daily_yield),
          max(tly_data$total_lactation_yield), max(ci_data$calving_interval_days)),
  CV = c(sd(afc_data$afc_months)/mean(afc_data$afc_months)*100,
         sd(daily_milk_data$daily_yield)/mean(daily_milk_data$daily_yield)*100,
         sd(tly_data$total_lactation_yield)/mean(tly_data$total_lactation_yield)*100,
         sd(ci_data$calving_interval_days)/mean(ci_data$calving_interval_days)*100)
) %>%
  mutate(N = format(N, big.mark = ","),
         across(c(Mean, SD, CV, Min, Median, Max), ~round(., 2))) %>%
  styled_table(caption = "Descriptive statistics for all traits")

Descriptive statistics for all traits
Trait	N	Mean	SD	Min	Median	Max	CV
AFC (months)	27,503	39.34	11.90	18.00	36.0	72.0	30.24
Daily Milk (kg)	582,060	9.24	7.07	0.51	9.2	50.0	76.54
Lactation Milk (kg)	9,558	979.05	1706.33	100.00	364.3	19583.5	174.28
Calving Interval (days)	44,049	459.45	130.92	280.00	416.0	900.0	28.49

8.2 Distribution Plots

p_afc <- ggplot(afc_data, aes(x = afc_months)) +
  geom_histogram(aes(y = after_stat(density)), bins = 35, 
                 fill = trait_colors["AFC"], color = "white", alpha = 0.85) +
  geom_density(linewidth = 1.2, color = "#9a3412") +
  geom_vline(xintercept = mean(afc_data$afc_months), linetype = "dashed", 
             color = "#1a1f36", linewidth = 1) +
  annotate("text", x = mean(afc_data$afc_months) + 3, y = Inf, vjust = 2, hjust = 0,
           label = paste0("Mean: ", round(mean(afc_data$afc_months), 1), " mo"),
           fontface = "bold", size = 3.5) +
  labs(title = "Age at First Calving", x = "AFC (months)", y = "Density")

p_dm <- ggplot(daily_milk_data, aes(x = daily_yield)) +
  geom_histogram(aes(y = after_stat(density)), bins = 45, 
                 fill = trait_colors["Daily Milk"], color = "white", alpha = 0.85) +
  geom_density(linewidth = 1.2, color = "#1e3a8a") +
  geom_vline(xintercept = mean(daily_milk_data$daily_yield), linetype = "dashed",
             color = "#1a1f36", linewidth = 1) +
  annotate("text", x = mean(daily_milk_data$daily_yield) + 2, y = Inf, vjust = 2, hjust = 0,
           label = paste0("Mean: ", round(mean(daily_milk_data$daily_yield), 1), " kg"),
           fontface = "bold", size = 3.5) +
  labs(title = "Daily Milk Yield", x = "Daily Yield (kg)", y = "Density")

p_tly <- ggplot(tly_data, aes(x = total_lactation_yield)) +
  geom_histogram(aes(y = after_stat(density)), bins = 45, 
                 fill = trait_colors["Lactation Milk"], color = "white", alpha = 0.85) +
  geom_density(linewidth = 1.2, color = "#065f46") +
  geom_vline(xintercept = mean(tly_data$total_lactation_yield), linetype = "dashed",
             color = "#1a1f36", linewidth = 1) +
  annotate("text", x = mean(tly_data$total_lactation_yield) + 500, y = Inf, vjust = 2, hjust = 0,
           label = paste0("Mean: ", format(round(mean(tly_data$total_lactation_yield), 0), big.mark = ","), " kg"),
           fontface = "bold", size = 3.5) +
  scale_x_continuous(labels = comma) +
  labs(title = "Total Lactation Yield", x = "Lactation Yield (kg)", y = "Density")

p_ci <- ggplot(ci_data, aes(x = calving_interval_days)) +
  geom_histogram(aes(y = after_stat(density)), bins = 40, 
                 fill = trait_colors["Calving Interval"], color = "white", alpha = 0.85) +
  geom_density(linewidth = 1.2, color = "#4c1d95") +
  geom_vline(xintercept = mean(ci_data$calving_interval_days), linetype = "dashed",
             color = "#1a1f36", linewidth = 1) +
  annotate("text", x = mean(ci_data$calving_interval_days) + 30, y = Inf, vjust = 2, hjust = 0,
           label = paste0("Mean: ", round(mean(ci_data$calving_interval_days), 0), " d"),
           fontface = "bold", size = 3.5) +
  labs(title = "Calving Interval", x = "Calving Interval (days)", y = "Density")

(p_afc + p_dm) / (p_tly + p_ci) +
  plot_annotation(
    title = "Distribution of Dairy Cattle Traits in Kenya",
    subtitle = "Dashed lines indicate population means",
    theme = theme(plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
                  plot.subtitle = element_text(size = 12, hjust = 0.5, color = "#6B7280"))
  )

8.3 Box Plots by Region

# Boxplots with improved spacing
p1_box <- ggplot(afc_data, aes(x = reorder(region, afc_months, FUN = median), y = afc_months)) +
  geom_boxplot(fill = trait_colors["AFC"], alpha = 0.7, outlier.size = 0.5) +
  stat_summary(fun = mean, geom = "point", shape = 18, size = 4, color = "darkred") +
  stat_summary(fun = mean, geom = "text", aes(label = round(after_stat(y), 1)), 
               vjust = -1, size = 3.5, fontface = "bold") +
  labs(title = "Age at First Calving by Region", x = NULL, y = "AFC (months)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10, face = "bold"),
        plot.margin = margin(10, 10, 10, 20))

p2_box <- ggplot(daily_milk_data, aes(x = reorder(region, daily_yield, FUN = median), y = daily_yield)) +
  geom_boxplot(fill = trait_colors["Daily Milk"], alpha = 0.7, outlier.size = 0.5) +
  stat_summary(fun = mean, geom = "point", shape = 18, size = 4, color = "darkblue") +
  stat_summary(fun = mean, geom = "text", aes(label = round(after_stat(y), 1)), 
               vjust = -1, size = 3.5, fontface = "bold") +
  labs(title = "Daily Milk Yield by Region", x = NULL, y = "Daily Milk (kg)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10, face = "bold"),
        plot.margin = margin(10, 10, 10, 20))

p3_box <- ggplot(tly_data, aes(x = reorder(region, total_lactation_yield, FUN = median), y = total_lactation_yield)) +
  geom_boxplot(fill = trait_colors["Lactation Milk"], alpha = 0.7, outlier.size = 0.5) +
  stat_summary(fun = mean, geom = "point", shape = 18, size = 4, color = "darkgreen") +
  stat_summary(fun = mean, geom = "text", aes(label = format(round(after_stat(y), 0), big.mark = ",")), 
               vjust = -1, size = 3.5, fontface = "bold") +
  labs(title = "Total Lactation Yield by Region", x = NULL, y = "Lactation Milk (kg)") +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10, face = "bold"),
        plot.margin = margin(10, 10, 10, 20))

p4_box <- ggplot(ci_data, aes(x = reorder(region, calving_interval_days, FUN = median), y = calving_interval_days)) +
  geom_boxplot(fill = trait_colors["Calving Interval"], alpha = 0.7, outlier.size = 0.5) +
  stat_summary(fun = mean, geom = "point", shape = 18, size = 4, color = "purple4") +
  stat_summary(fun = mean, geom = "text", aes(label = round(after_stat(y), 0)), 
               vjust = -1, size = 3.5, fontface = "bold") +
  labs(title = "Calving Interval by Region", x = NULL, y = "CI (days)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10, face = "bold"),
        plot.margin = margin(10, 10, 10, 20))

(p1_box + p2_box) / (p3_box + p4_box) +
  plot_annotation(
    title = "Trait Distribution by Region",
    subtitle = "Diamond markers show mean values; numbers above show regional means",
    theme = theme(plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
                  plot.subtitle = element_text(size = 11, hjust = 0.5, color = "#6B7280"))
  )

8.4 Lactation Curve

dim_summary <- daily_milk_data %>%
  mutate(dim_class_num = as.numeric(as.character(dim_class))) %>%
  group_by(dim_class_num) %>%
  summarise(N = n(), Mean = mean(daily_yield), SD = sd(daily_yield),
            SE = SD / sqrt(N), Lower = Mean - 1.96 * SE, Upper = Mean + 1.96 * SE,
            .groups = 'drop') %>%
  mutate(DIM_midpoint = (dim_class_num - 0.5) * 30)

peak_row <- dim_summary[which.max(dim_summary$Mean), ]

ggplot(dim_summary, aes(x = DIM_midpoint, y = Mean)) +
  geom_ribbon(aes(ymin = Lower, ymax = Upper), fill = trait_colors["Daily Milk"], alpha = 0.25) +
  geom_line(linewidth = 1.5, color = trait_colors["Daily Milk"]) +
  geom_point(size = 3.5, color = trait_colors["Daily Milk"], fill = "white", shape = 21, stroke = 1.5) +
  geom_vline(xintercept = peak_row$DIM_midpoint, linetype = "dashed", color = "#dc2626", linewidth = 0.8) +
  geom_point(data = peak_row, aes(x = DIM_midpoint, y = Mean), size = 6, color = "#dc2626", shape = 18) +
  annotate("text", x = peak_row$DIM_midpoint + 35, y = peak_row$Mean + 0.5,
           label = paste0("Peak: ", round(peak_row$Mean, 1), " kg\n@ DIM ", round(peak_row$DIM_midpoint, 0)),
           color = "#dc2626", fontface = "bold", size = 4) +
  scale_x_continuous(breaks = seq(0, 500, 50), limits = c(0, 520)) +
  labs(title = "Average Lactation Curve of Kenya Dairy Cattle",
       subtitle = "Mean daily milk yield (±95% CI) across days in milk (DIM)",
       x = "Days in Milk (DIM)", y = "Daily Milk Yield (kg)",
       caption = paste("Based on", format(nrow(daily_milk_data), big.mark = ","), "test-day records"))

9 Fixed Effects Analysis

9.1 Linear Models

afc_model <- lm(afc_months ~ region + breed, data = afc_data)
dm_model <- lm(daily_yield ~ region + breed + parity_class + dim_class, data = daily_milk_data)
tly_model <- lm(total_lactation_yield ~ region + breed + parity_class, data = tly_data)
ci_model <- lm(calving_interval_days ~ region + breed + parity_class, data = ci_data)

data.frame(
  Trait = c("AFC", "Daily Milk", "Lactation Yield", "Calving Interval"),
  Model = c("Region + Breed", "Region + Breed + Parity + DIM",
            "Region + Breed + Parity", "Region + Breed + Parity"),
  N = c(nrow(afc_data), nrow(daily_milk_data), nrow(tly_data), nrow(ci_data)),
  R2 = c(summary(afc_model)$r.squared, summary(dm_model)$r.squared,
         summary(tly_model)$r.squared, summary(ci_model)$r.squared) * 100,
  Adj_R2 = c(summary(afc_model)$adj.r.squared, summary(dm_model)$adj.r.squared,
             summary(tly_model)$adj.r.squared, summary(ci_model)$adj.r.squared) * 100
) %>%
  mutate(N = format(N, big.mark = ","),
         R2 = paste0(round(R2, 2), "%"),
         Adj_R2 = paste0(round(Adj_R2, 2), "%")) %>%
  rename(`R² (%)` = R2, `Adj. R² (%)` = Adj_R2) %>%
  styled_table(caption = "Variance explained by fixed effects models")

Variance explained by fixed effects models
Trait	Model	N	R² (%)	Adj. R² (%)
AFC	Region + Breed	27,503	22.07%	21.9%
Daily Milk	Region + Breed + Parity + DIM	582,060	53.8%	53.8%
Lactation Yield	Region + Breed + Parity	9,558	36.27%	35.98%
Calving Interval	Region + Breed + Parity	44,049	6.3%	6.18%

9.2 R² Visualization

r2_data <- data.frame(
  Trait = c("AFC", "Daily Milk", "Lactation Yield", "Calving Interval"),
  R_squared = c(summary(afc_model)$r.squared, summary(dm_model)$r.squared,
                summary(tly_model)$r.squared, summary(ci_model)$r.squared) * 100
)

ggplot(r2_data, aes(x = reorder(Trait, R_squared), y = R_squared, fill = Trait)) +
  geom_col(width = 0.6, show.legend = FALSE) +
  geom_text(aes(label = paste0(round(R_squared, 1), "%")), hjust = -0.2, size = 5, fontface = "bold") +
  scale_fill_manual(values = trait_colors) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.2)), limits = c(0, 100)) +
  coord_flip() +
  labs(title = "Variance Explained by Fixed Effects",
       subtitle = "DIM class has major impact on daily milk yield",
       x = NULL, y = "R-squared (%)")

10 Key Findings

10.0.1 📊 Major Findings

Breed Effects: Holstein-Friesian and improved crossbreds outperform indigenous breeds
Regional Variation: Significant differences exist between regions - visualized in interactive maps
Lactation Dynamics: Peak production occurs around DIM 30-60
Fixed Effects Impact: DIM explains substantial variation in daily milk yields
Geographic Clustering: Major dairy production concentrated in Central and Rift Valley regions

11 Session Information

sessionInfo()

R version 4.5.1 (2025-06-13 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 26100)

Matrix products: default
  LAPACK version 3.12.1

locale:
[1] LC_COLLATE=English_Kenya.utf8  LC_CTYPE=English_Kenya.utf8   
[3] LC_MONETARY=English_Kenya.utf8 LC_NUMERIC=C                  
[5] LC_TIME=English_Kenya.utf8    

time zone: Africa/Nairobi
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] RColorBrewer_1.1-3   htmltools_0.5.8.1    sf_1.0-24           
 [4] leaflet.extras_2.0.1 leaflet_2.2.3        patchwork_1.3.2     
 [7] broom_1.0.9          scales_1.4.0         kableExtra_1.4.0    
[10] knitr_1.50           ggplot2_3.5.2        tidyr_1.3.1         
[13] dplyr_1.1.4          data.table_1.17.8   

loaded via a namespace (and not attached):
 [1] sass_0.4.10             generics_0.1.4          class_7.3-23           
 [4] xml2_1.4.0              KernSmooth_2.23-26      stringi_1.8.7          
 [7] digest_0.6.37           magrittr_2.0.3          evaluate_1.0.5         
[10] grid_4.5.1              fastmap_1.2.0           jsonlite_2.0.0         
[13] e1071_1.7-17            backports_1.5.0         DBI_1.2.3              
[16] purrr_1.1.0             crosstalk_1.2.2         viridisLite_0.4.2      
[19] textshaping_1.0.2       jquerylib_0.1.4         cli_3.6.5              
[22] rlang_1.1.6             units_1.0-0             withr_3.0.2            
[25] cachem_1.1.0            yaml_2.3.10             tools_4.5.1            
[28] mime_0.13               vctrs_0.6.5             R6_2.6.1               
[31] proxy_0.4-29            lifecycle_1.0.4         classInt_0.4-11        
[34] stringr_1.5.1           leaflet.providers_2.0.0 htmlwidgets_1.6.4      
[37] pkgconfig_2.0.3         pillar_1.11.0           bslib_0.9.0            
[40] gtable_0.3.6            Rcpp_1.1.0              glue_1.8.0             
[43] systemfonts_1.2.3       xfun_0.52               tibble_3.3.0           
[46] tidyselect_1.2.1        rstudioapi_0.17.1       farver_2.1.2           
[49] labeling_0.4.3          rmarkdown_2.29          svglite_2.2.1          
[52] compiler_4.5.1

Genetic Evaluation of Dairy Cattle Traits in Kenya

Phenotypic Analysis and Fixed Effects Modeling

Setegn Worku Alemu

January 21, 2026