1 Introduction

1.1 Project Overview

🎯 Project Context: The Tropical Dairy Genetic Gains (TDGG) project aims to establish sustainable genetic improvement programs for smallholder dairy systems in East Africa, funded by the Bill & Melinda Gates Foundation.

1.1.1 Study Objectives

  1. Characterize phenotypic distributions of key dairy traits
  2. Quantify fixed effects (Region, Breed, Parity, DIM)
  3. Prepare data for genetic evaluation using BLUPF90

1.2 Traits Analyzed

Economic traits under genetic evaluation
Trait Unit Heritability Importance
Age at First Calving (AFC) months 0.05 - 0.15 Reproductive efficiency
Daily Milk Yield (DMY) kg/day 0.15 - 0.30 Daily productivity
Total Lactation Yield (TLY) kg/lactation 0.20 - 0.35 Total production
Calving Interval (CI) days 0.02 - 0.06 Reproductive interval

2 Data Loading

2.1 Load Data

# ==============================================================================
# SET YOUR DATA PATH HERE
# ==============================================================================
data_path <- "C:/Users/SAlemu/OneDrive - CGIAR/Documents/KenyaData/New_Kenya_data/Version2/"
output_path <- paste0(data_path, "BLUPF90CL/")

if(!dir.exists(output_path)) dir.create(output_path, recursive = TRUE)

cat("═══════════════════════════════════════════════════════════════════════════\n")
═══════════════════════════════════════════════════════════════════════════
cat("                     LOADING KENYA DAIRY DATA FILES                         \n")
                     LOADING KENYA DAIRY DATA FILES                         
cat("═══════════════════════════════════════════════════════════════════════════\n\n")
═══════════════════════════════════════════════════════════════════════════
# Pedigree data
pedigree <- fread(paste0(data_path, "pedigree-extract.csv"),
                  na.strings = c("NULL", "null", "NA", "N/A", "", " "),
                  stringsAsFactors = FALSE)

# Lactation data
lactation <- fread(paste0(data_path, "lactation_calving_improved(in).csv"),
                   na.strings = c("NULL", "null", "NA", "N/A", "", " "),
                   stringsAsFactors = FALSE)

# Milking data
milking <- fread(paste0(data_path, "milking_data_improved.csv"),
                 na.strings = c("NULL", "null", "NA", "N/A", "", " "),
                 stringsAsFactors = FALSE)

cat("✓ Pedigree:", format(nrow(pedigree), big.mark = ","), "records\n")
✓ Pedigree: 248,208 records
cat("✓ Lactation:", format(nrow(lactation), big.mark = ","), "records\n")
✓ Lactation: 88,352 records
cat("✓ Milking:", format(nrow(milking), big.mark = ","), "records\n\n")
✓ Milking: 1,048,575 records
# Show column names for debugging
cat("Pedigree columns:", paste(names(pedigree), collapse = ", "), "\n\n")
Pedigree columns: country, region, district, ward, village, farmer_name, farm_id, org_id, organization_name, project, animal_id, tag_id, original_tag_id, sire_tag_id, sire_id, dam_tag_id, dam_id, sex, estimated_sex, reg_date, birthdate, main_breed, breed, longitude, latitude, warning, error, Extract_Datetime_UTC, Version 
cat("Lactation columns:", paste(names(lactation), collapse = ", "), "\n\n")
Lactation columns: farm_id, project, animal_id, tag_id, original_tag_id, animal_name, animal_registration_date, birthdate, sex, animal_sex, animal_type, animal_type_name, main_breed, main_breed_name, breed_composition, breed_proportion, secondary_breed_id, secondary_breed_name, secondary_breed_other, lactation_id, calving_date, parity, afc_months, afc_days, calving_interval_days, calving_interval_months, lactation_length_days, lactation_length_months, total_lactation_yield, testday_count, days_open, org_id, project_name, V34 
cat("Milking columns:", paste(names(milking), collapse = ", "), "\n")
Milking columns: farm_id, project, animal_id, tag_id, original_tag_id, animal_name, animal_registration_date, birthdate, sex, animal_sex, main_breed, main_breed_name, breed_composition, breed_proportion, secondary_breed_id, secondary_breed_name, secondary_breed_other, lactation_id, calving_date, parity, afc_months, afc_days, calving_age_months, calving_age_days, milk_event_id, milking_date, testday_no, dim, milk_morning, milk_evening, milk_midday, daily_yield, data_completeness_score, org_id, project_name, record_created_date 

2.2 Data Structure

head(pedigree, 5) %>%
  select(any_of(c("animal_id", "sire_id", "dam_id", "sex", "breed", "main_breed", 
                  "region", "district", "farm_id", "country"))) %>%
  styled_table(caption = "Sample pedigree records")
Sample pedigree records
animal_id sire_id dam_id sex breed main_breed region district farm_id country
170285 NA NA FEMALE Ayrshire 3 Migori Kuria West 515160 Kenya
170298 0 NA FEMALE Ayrshire 3 Kakamega Likuyani 515626 Kenya
170315 NA NA FEMALE Ayrshire 3 Kakamega Lugari 598316 Kenya
170328 NA NA FEMALE Ayrshire 3 Kakamega Likuyani 595412 Kenya
170506 NA NA FEMALE Local Zebu 26 Homa Bay Ndhiwa 514950 Kenya

3 Pedigree Analysis

3.1 Pedigree Statistics

n_animals <- nrow(pedigree)
n_sires <- n_distinct(pedigree$sire_id[!is.na(pedigree$sire_id)])
n_dams <- n_distinct(pedigree$dam_id[!is.na(pedigree$dam_id)])
n_founders <- sum(is.na(pedigree$sire_id) & is.na(pedigree$dam_id))
pct_known_sire <- mean(!is.na(pedigree$sire_id)) * 100
pct_known_dam <- mean(!is.na(pedigree$dam_id)) * 100

data.frame(
  Statistic = c("Total Animals", "Unique Sires", "Unique Dams", 
                "Founders", "Known Sire (%)", "Known Dam (%)"),
  Value = c(format(n_animals, big.mark = ","), format(n_sires, big.mark = ","),
            format(n_dams, big.mark = ","), format(n_founders, big.mark = ","),
            paste0(round(pct_known_sire, 1), "%"), paste0(round(pct_known_dam, 1), "%"))
) %>% styled_table(caption = "Pedigree completeness statistics")
Pedigree completeness statistics
Statistic Value
Total Animals 248,208
Unique Sires 8,415
Unique Dams 44,062
Founders 151,283
Known Sire (%) 32.1%
Known Dam (%) 28.7%

3.2 Geographic Distribution

# Create summary data
region_summary <- pedigree %>%
  filter(!is.na(region)) %>%
  group_by(region) %>%
  summarise(
    Animals = n(), 
    Farms = n_distinct(farm_id), 
    Districts = n_distinct(district), 
    .groups = 'drop'
  ) %>%
  arrange(desc(Animals)) %>%
  mutate(
    Percentage = round(Animals / sum(Animals) * 100, 1),
    # Create label with region name and count
    label_animals = paste0(region, ": ", format(Animals, big.mark = ","), " (", Percentage, "%)"),
    label_farms = paste0(region, ": ", format(Farms, big.mark = ","))
  )

# Display as a nice table first
region_summary %>%
  select(Region = region, Animals, `%` = Percentage, Farms, Districts) %>%
  mutate(Animals = format(Animals, big.mark = ","),
         Farms = format(Farms, big.mark = ",")) %>%
  styled_table(caption = "Geographic distribution summary")
Geographic distribution summary
Region Animals % Farms Districts
Nakuru 56,756 27.9 1,680 12
Makueni 15,293 7.5 3,265 8
Trans Nzoia 14,302 7.0 321 5
Uasin Gishu 13,297 6.5 1,462 7
Kiambu 11,948 5.9 948 11
Nairobi 11,849 5.8 489 10
Nyeri 10,100 5.0 1,475 6
Laikipia 9,590 4.7 329 3
Muranga 6,749 3.3 1,708 8
Nyandarua 5,973 2.9 1,100 8
Meru 5,710 2.8 1,131 6
Kisumu 4,290 2.1 1,382 8
Narok 4,251 2.1 117 6
Kakamega 4,028 2.0 1,277 12
Nandi County 2,484 1.2 549 5
Siaya 2,442 1.2 1,068 7
Kajiado 2,129 1.0 138 5
Kilifi 1,738 0.9 8 3
Bungoma 1,736 0.9 512 9
Embu 1,603 0.8 209 3
Taita Taveta 1,587 0.8 181 6
Homa Bay 1,584 0.8 838 9
Busia 1,553 0.8 720 8
Migori 1,504 0.7 825 10
Kisii 1,240 0.6 273 10
Baringo 1,237 0.6 101 6
Bomet 1,002 0.5 193 6
Tharaka Nithi 981 0.5 200 3
Vihiga 969 0.5 431 6
Kitui 862 0.4 516 9
Elgeyo Marakwet 823 0.4 85 4
Machakos 800 0.4 108 6
Kericho 788 0.4 95 5
Kirinyaga 739 0.4 84 4
Nyamira 667 0.3 196 5
Mombasa 400 0.2 7 1
West Pokot 118 0.1 9 2
Marsabit 58 0.0 14 2
Kwale 45 0.0 2 2
Oromia 18 0.0 1 1
Mandera 8 0.0 1 1
Tana River 6 0.0 3 1
Amhara 5 0.0 3 2
Kolero Ward 3 0.0 1 1
Osun 1 0.0 1 1
# Side-by-side horizontal bar plots with values displayed inside bars
p1 <- ggplot(region_summary, aes(x = reorder(region, Animals), y = Animals)) +

  geom_col(aes(fill = Animals), width = 0.7, show.legend = FALSE) +
  geom_text(aes(label = paste0(format(Animals, big.mark = ","), " (", Percentage, "%)")), 
            hjust = 1.1, size = 4, fontface = "bold", color = "white") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05)), labels = comma) +
  scale_fill_gradient(low = "#3B82F6", high = "#1e3a8a") +
  coord_flip() +
  labs(
    title = "Animals by Region",
    x = NULL, 
    y = "Number of Animals"
  ) +
  theme(
    axis.text.y = element_text(size = 11, face = "bold"),
    plot.title = element_text(size = 14)
  )

p2 <- ggplot(region_summary, aes(x = reorder(region, Farms), y = Farms)) +
  geom_col(aes(fill = Farms), width = 0.7, show.legend = FALSE) +
  geom_text(aes(label = format(Farms, big.mark = ",")), 
            hjust = 1.1, size = 4, fontface = "bold", color = "white") +
  scale_fill_gradient(low = "#10B981", high = "#065f46") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  coord_flip() +
  labs(
    title = "Farms by Region",
    x = NULL, 
    y = "Number of Farms"
  ) +
  theme(
    axis.text.y = element_text(size = 11, face = "bold"),
    plot.title = element_text(size = 14)
  )

p1 + p2 + plot_annotation(
  title = "Geographic Coverage of Kenya Dairy Genetic Evaluation",
  theme = theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
)


4 Interactive Maps

4.1 Regional Map

🗺️ Interactive Visualization: Explore the geographic distribution of dairy cattle and farms across Kenya’s counties. Click on markers for detailed statistics.

# ==============================================================================
# KENYA COUNTY COORDINATES
# ==============================================================================
# Approximate centroids for Kenya's major dairy regions
# You may need to adjust these based on your actual region names in the data

kenya_coords <- data.frame(
  region = c(
    # Major dairy regions (Rift Valley & Central)
    "Nakuru", "Nyandarua", "Kiambu", "Nyeri", "Murang'a", "Kirinyaga",
    "Laikipia", "Narok", "Baringo", "Uasin Gishu", "Trans Nzoia", "Elgeyo Marakwet",
    "Nandi", "Kericho", "Bomet", "West Pokot",
    # Western Kenya
    "Kakamega", "Bungoma", "Vihiga", "Busia",
    # Nyanza
    "Kisii", "Nyamira", "Kisumu", "Siaya", "Homa Bay", "Migori",
    # Eastern
    "Meru", "Embu", "Tharaka-Nithi", "Machakos", "Makueni", "Kitui",
    # Coast
    "Kilifi", "Kwale", "Mombasa", "Taita Taveta",
    # Nairobi & surroundings
    "Nairobi", "Kajiado"
  ),
  lat = c(
    -0.2833, -0.4000, -1.0000, -0.4167, -0.7833, -0.5000,
    0.2000, -1.1000, 0.4667, 0.5167, 1.0000, 0.9500,
    0.1833, -0.3667, -0.7833, 1.6167,
    0.2833, 0.5667, 0.0500, 0.4500,
    -0.6833, -0.5667, -0.1000, 0.0617, -0.5167, -1.0667,
    0.0500, -0.5333, -0.3000, -1.5167, -1.8000, -1.3667,
    -3.5000, -4.1833, -4.0500, -3.4000,
    -1.2833, -1.8500
  ),
  lng = c(
    36.0667, 36.4333, 36.8333, 37.0000, 37.0000, 37.3333,
    36.7833, 35.8667, 35.9667, 35.2833, 35.0000, 35.5000,
    35.1833, 35.2833, 35.3333, 35.1000,
    34.7500, 34.5667, 34.7333, 34.1333,
    34.7667, 34.9333, 34.7500, 34.2500, 34.4500, 34.4667,
    37.6500, 37.4500, 37.8500, 37.2667, 37.6167, 38.0000,
    39.8500, 39.4500, 39.6667, 38.3500,
    36.8167, 36.7833
  ),
  stringsAsFactors = FALSE
)

# Merge with region summary data
map_data <- region_summary %>%
  left_join(kenya_coords, by = "region") %>%
  filter(!is.na(lat), !is.na(lng))

# Check for unmatched regions
unmatched <- region_summary %>%
  anti_join(kenya_coords, by = "region")

if(nrow(unmatched) > 0) {
  cat("Note: The following regions could not be mapped (coordinates not found):\n")
  cat(paste(unmatched$region, collapse = ", "), "\n")
  cat("You may need to add their coordinates to the kenya_coords dataframe.\n\n")
}
Note: The following regions could not be mapped (coordinates not found):
Muranga, Nandi County, Tharaka Nithi, Marsabit, Oromia, Mandera, Tana River, Amhara, Kolero Ward, Osun 
You may need to add their coordinates to the kenya_coords dataframe.
# ==============================================================================
# INTERACTIVE BUBBLE MAP - Animals by Region
# ==============================================================================

if(nrow(map_data) > 0) {
  
  # Create color palette based on number of animals
  pal <- colorNumeric(
    palette = c("#3B82F6", "#1e40af", "#1e3a8a"),
    domain = map_data$Animals
  )
  
  # Scale bubble size (sqrt transformation for better visual scaling)
  map_data <- map_data %>%
    mutate(
      radius = sqrt(Animals) / max(sqrt(Animals)) * 40 + 5,
      popup_content = paste0(
        "<div style='font-family: sans-serif; padding: 10px;'>",
        "<h4 style='color: #1a1f36; margin: 0 0 10px 0; border-bottom: 2px solid #D4AF37; padding-bottom: 5px;'>",
        "<strong>", region, "</strong></h4>",
        "<table style='width: 100%; font-size: 13px;'>",
        "<tr><td style='padding: 3px 10px 3px 0;'>🐄 Animals:</td><td style='font-weight: bold; color: #3B82F6;'>", 
        format(Animals, big.mark = ","), "</td></tr>",
        "<tr><td style='padding: 3px 10px 3px 0;'>🏠 Farms:</td><td style='font-weight: bold; color: #10B981;'>", 
        format(Farms, big.mark = ","), "</td></tr>",
        "<tr><td style='padding: 3px 10px 3px 0;'>📍 Districts:</td><td style='font-weight: bold; color: #7C3AED;'>", 
        Districts, "</td></tr>",
        "<tr><td style='padding: 3px 10px 3px 0;'>📊 Share:</td><td style='font-weight: bold; color: #F97316;'>", 
        Percentage, "%</td></tr>",
        "</table></div>"
      )
    )
  
  # Create the map
  leaflet(map_data, width = "100%", height = 600) %>%
    addProviderTiles("CartoDB.Positron", group = "Light") %>%
    addProviderTiles("Esri.WorldImagery", group = "Satellite") %>%
    addProviderTiles("OpenStreetMap", group = "OpenStreetMap") %>%
    addProviderTiles("Esri.WorldTopoMap", group = "Topographic") %>%
    
    # Add circle markers
    addCircleMarkers(
      lng = ~lng, lat = ~lat,
      radius = ~radius,
      color = "#1e3a8a",
      weight = 2,
      opacity = 0.9,
      fillColor = ~pal(Animals),
      fillOpacity = 0.7,
      popup = ~popup_content,
      label = ~paste0(region, ": ", format(Animals, big.mark = ","), " animals"),
      labelOptions = labelOptions(
        style = list("font-weight" = "bold", "font-size" = "12px"),
        textsize = "12px",
        direction = "auto"
      ),
      group = "Animals"
    ) %>%
    
    # Add legend
    addLegend(
      position = "bottomright",
      pal = pal,
      values = ~Animals,
      title = "Number of<br>Animals",
      opacity = 0.9,
      labFormat = labelFormat(big.mark = ",")
    ) %>%
    
    # Add layer controls
    addLayersControl(
      baseGroups = c("Light", "Satellite", "OpenStreetMap", "Topographic"),
      options = layersControlOptions(collapsed = TRUE)
    ) %>%
    
    # Add scale bar
    addScaleBar(position = "bottomleft") %>%
    
    # Set default view to Kenya
    setView(lng = 37.0, lat = -0.5, zoom = 6) %>%
    
    # Add title
    addControl(
      html = "<div style='background: white; padding: 10px 15px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); font-family: sans-serif;'>
              <strong style='font-size: 16px; color: #1a1f36;'>🐄 Kenya Dairy Cattle Distribution</strong><br>
              <span style='font-size: 12px; color: #6B7280;'>Click markers for detailed statistics</span></div>",
      position = "topright"
    )
  
} else {
  cat("No regions could be mapped. Please check that region names in your data match the coordinate lookup table.\n")
}

4.2 Farm Density Heatmap

# ==============================================================================
# HEATMAP VISUALIZATION - Farm Density
# ==============================================================================

if(nrow(map_data) > 0) {
  
  # Create expanded points for heatmap effect (simulate density)
  # Weight by number of farms
  heatmap_points <- map_data %>%
    slice(rep(1:n(), times = ceiling(Farms/100))) %>%
    mutate(
      # Add small random jitter for visual effect
      lat_jitter = lat + runif(n(), -0.15, 0.15),
      lng_jitter = lng + runif(n(), -0.15, 0.15)
    )
  
  leaflet(width = "100%", height = 600) %>%
    addProviderTiles("CartoDB.DarkMatter", group = "Dark") %>%
    addProviderTiles("CartoDB.Positron", group = "Light") %>%
    
    # Add heatmap layer
    addHeatmap(
      data = heatmap_points,
      lng = ~lng_jitter, lat = ~lat_jitter,
      intensity = ~Farms/max(Farms),
      blur = 25,
      max = 0.8,
      radius = 30,
      gradient = c(
        "0" = "transparent",
        "0.2" = "#3B82F6",
        "0.4" = "#10B981", 
        "0.6" = "#F97316",
        "0.8" = "#EF4444",
        "1" = "#DC2626"
      ),
      group = "Heatmap"
    ) %>%
    
    # Add region labels
    addLabelOnlyMarkers(
      data = map_data,
      lng = ~lng, lat = ~lat,
      label = ~region,
      labelOptions = labelOptions(
        noHide = TRUE,
        direction = "center",
        textOnly = TRUE,
        style = list(
          "color" = "white",
          "font-weight" = "bold",
          "font-size" = "11px",
          "text-shadow" = "1px 1px 2px black"
        )
      ),
      group = "Labels"
    ) %>%
    
    addLayersControl(
      baseGroups = c("Dark", "Light"),
      overlayGroups = c("Heatmap", "Labels"),
      options = layersControlOptions(collapsed = TRUE)
    ) %>%
    
    addScaleBar(position = "bottomleft") %>%
    
    setView(lng = 37.0, lat = -0.5, zoom = 6) %>%
    
    addControl(
      html = "<div style='background: rgba(26,31,54,0.9); padding: 10px 15px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.3); font-family: sans-serif;'>
              <strong style='font-size: 16px; color: white;'>🔥 Farm Density Heatmap</strong><br>
              <span style='font-size: 12px; color: #9CA3AF;'>Intensity reflects concentration of dairy farms</span></div>",
      position = "topright"
    )
}

4.3 Clustered Markers

# ==============================================================================
# CLUSTERED MARKERS - Interactive Farm Exploration
# ==============================================================================

if(nrow(map_data) > 0) {
  
  # Assign colors based on animal count
  map_data <- map_data %>%
    mutate(
      marker_color = case_when(
        Animals > 10000 ~ "#DC2626",
        Animals > 5000 ~ "#F97316",
        Animals > 1000 ~ "#0891B2",
        TRUE ~ "#3B82F6"
      )
    )
  
  leaflet(map_data, width = "100%", height = 600) %>%
    addProviderTiles("CartoDB.Voyager", group = "Voyager") %>%
    addProviderTiles("Esri.WorldStreetMap", group = "Streets") %>%
    addProviderTiles("Esri.WorldTopoMap", group = "Terrain") %>%
    
    addCircleMarkers(
      lng = ~lng, lat = ~lat,
      radius = 12,
      color = ~marker_color,
      weight = 2,
      opacity = 1,
      fillColor = ~marker_color,
      fillOpacity = 0.7,
      popup = ~popup_content,
      label = ~paste0(region, " (", format(Animals, big.mark = ","), ")"),
      clusterOptions = markerClusterOptions(
        iconCreateFunction = JS("
          function(cluster) {
            var childCount = cluster.getChildCount();
            var c = ' marker-cluster-';
            if (childCount < 5) {
              c += 'small';
            } else if (childCount < 10) {
              c += 'medium';
            } else {
              c += 'large';
            }
            return new L.DivIcon({ 
              html: '<div><span>' + childCount + '</span></div>', 
              className: 'marker-cluster' + c, 
              iconSize: new L.Point(40, 40) 
            });
          }
        ")
      )
    ) %>%
    
    addLayersControl(
      baseGroups = c("Voyager", "Streets", "Terrain"),
      options = layersControlOptions(collapsed = TRUE)
    ) %>%
    
    addScaleBar(position = "bottomleft") %>%
    
    setView(lng = 37.0, lat = -0.5, zoom = 6) %>%
    
    addControl(
      html = "<div style='background: white; padding: 10px 15px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); font-family: sans-serif;'>
              <strong style='font-size: 16px; color: #1a1f36;'>📍 Regional Clusters</strong><br>
              <span style='font-size: 12px; color: #6B7280;'>Click clusters to zoom, markers for details</span><br>
              <div style='margin-top: 8px; font-size: 11px;'>
              <span style='color: #DC2626;'>●</span> >10,000 &nbsp;
              <span style='color: #F97316;'>●</span> 5,000-10,000 &nbsp;
              <span style='color: #0891B2;'>●</span> 1,000-5,000 &nbsp;
              <span style='color: #3B82F6;'>●</span> <1,000
              </div></div>",
      position = "topright"
    )
}

4.4 Trait Performance Map

# ==============================================================================
# PREPARE TRAIT DATA BY REGION FOR MAPPING
# (This section will be populated after phenotype data is processed)
# ==============================================================================

# Note: This chunk prepares the trait summary data for mapping
# It will be displayed after the phenotypic data preparation section below

5 Breed Composition

# Determine which breed column exists
breed_col <- if("breed" %in% names(pedigree)) "breed" else if("main_breed" %in% names(pedigree)) "main_breed" else NULL

if(!is.null(breed_col)) {
  breed_summary <- pedigree %>%
    filter(!is.na(.data[[breed_col]])) %>%
    count(breed_val = .data[[breed_col]], name = "Count") %>%
    arrange(desc(Count)) %>%
    head(12) %>%
    mutate(Percentage = round(Count / sum(Count) * 100, 1))
  
  # Show table first
  breed_summary %>%
    rename(Breed = breed_val, Animals = Count, `%` = Percentage) %>%
    mutate(Animals = format(Animals, big.mark = ",")) %>%
    styled_table(caption = "Breed composition of study population")
}
Breed composition of study population
Breed Animals %
Holstein Friesian 109,158 51.2
Ayrshire 32,484 15.2
Sahiwal 20,880 9.8
Boran 18,976 8.9
Jersey 8,631 4.1
Guernsey 6,293 3.0
Holstein-Friesian crosses 3,726 1.7
Unknown 3,484 1.6
Others 3,341 1.6
Simmental 2,824 1.3
Local Zebu 2,169 1.0
Sahiwal crosses 1,143 0.5
if(!is.null(breed_col)) {
  breed_summary <- breed_summary %>%
    mutate(breed_val = factor(breed_val, levels = rev(breed_val)))
  
  ggplot(breed_summary, aes(x = breed_val, y = Count)) +
    geom_col(aes(fill = Count), width = 0.7, show.legend = FALSE) +
    geom_text(aes(label = paste0(format(Count, big.mark = ","), " (", Percentage, "%)")),
              hjust = 1.1, size = 3.8, fontface = "bold", color = "white") +
    scale_fill_gradient(low = "#F97316", high = "#9a3412") +
    scale_y_continuous(expand = expansion(mult = c(0, 0.05)), labels = comma) +
    coord_flip() +
    labs(
      title = "Breed Composition of Study Population",
      subtitle = paste("Total:", format(sum(breed_summary$Count), big.mark = ","), "animals"),
      x = NULL, 
      y = "Number of Animals"
    ) +
    theme(
      axis.text.y = element_text(size = 10, face = "bold"),
      plot.title = element_text(size = 14)
    )
}


6 Phenotypic Data Preparation

6.1 Data Filtering

# ==============================================================================
# PREPARE LOOKUP TABLE FROM PEDIGREE (region only - farm_id is in phenotype files)
# ==============================================================================

# Determine breed column in pedigree
breed_col_ped <- if("breed" %in% names(pedigree)) "breed" else "main_breed"

pedigree_lookup <- pedigree %>%
  mutate(animal_id = as.character(animal_id)) %>%
  select(animal_id, region, district, any_of(breed_col_ped)) %>%
  rename(breed_ped = any_of(breed_col_ped)) %>%
  group_by(animal_id) %>%
  slice(1) %>%
  ungroup()

cat("Pedigree lookup created with", nrow(pedigree_lookup), "animals\n\n")
Pedigree lookup created with 248208 animals
# ==============================================================================
# AGE AT FIRST CALVING (AFC) DATA
# ==============================================================================
cat("Processing AFC data...\n")
Processing AFC data...
afc_data <- lactation %>%
  mutate(
    animal_id = as.character(animal_id),
    afc_months = as.numeric(afc_months),
    parity = as.numeric(parity)
  ) %>%
  filter(
    parity == 1,
    !is.na(afc_months),
    afc_months >= 18,
    afc_months <= 72
  ) %>%
  left_join(pedigree_lookup, by = "animal_id") %>%
  filter(!is.na(region)) %>%
  mutate(
    # Use main_breed_name from lactation if available, otherwise from pedigree
    breed = coalesce(main_breed_name, breed_ped)
  ) %>%
  filter(!is.na(breed), !is.na(farm_id)) %>%
  group_by(animal_id) %>%
  slice(1) %>%
  ungroup() %>%
  mutate(
    region = factor(region),
    farm_id = factor(farm_id),
    breed = factor(breed)
  )

cat("AFC records after filtering:", format(nrow(afc_data), big.mark = ","), "\n")
AFC records after filtering: 27,503 
# ==============================================================================
# DAILY MILK YIELD (TEST-DAY) DATA
# ==============================================================================
cat("Processing Daily Milk data...\n")
Processing Daily Milk data...
daily_milk_data <- milking %>%
  mutate(
    animal_id = as.character(animal_id),
    daily_yield = as.numeric(daily_yield),
    dim = as.numeric(dim),
    parity = as.numeric(parity)
  ) %>%
  filter(
    !is.na(daily_yield),
    daily_yield > 0.5,
    daily_yield < 60,
    !is.na(dim),
    dim >= 5,
    dim <= 500
  ) %>%
  left_join(pedigree_lookup, by = "animal_id") %>%
  filter(!is.na(region)) %>%
  mutate(
    breed = coalesce(main_breed_name, breed_ped)
  ) %>%
  filter(!is.na(breed), !is.na(farm_id)) %>%
  mutate(
    parity_class = factor(ifelse(is.na(parity) | parity < 1, 1, ifelse(parity > 4, 4, parity))),
    dim_class = factor(ifelse(ceiling(dim / 30) > 17, 17, ceiling(dim / 30))),
    region = factor(region),
    farm_id = factor(farm_id),
    breed = factor(breed)
  )

cat("Daily Milk records after filtering:", format(nrow(daily_milk_data), big.mark = ","), "\n")
Daily Milk records after filtering: 582,060 
# ==============================================================================
# TOTAL LACTATION YIELD (TLY) DATA
# ==============================================================================
cat("Processing TLY data...\n")
Processing TLY data...
tly_data <- lactation %>%
  mutate(
    animal_id = as.character(animal_id),
    total_lactation_yield = as.numeric(total_lactation_yield),
    parity = as.numeric(parity)
  ) %>%
  filter(
    !is.na(total_lactation_yield),
    total_lactation_yield >= 100,
    total_lactation_yield <= 20000
  ) %>%
  left_join(pedigree_lookup, by = "animal_id") %>%
  filter(!is.na(region)) %>%
  mutate(
    breed = coalesce(main_breed_name, breed_ped)
  ) %>%
  filter(!is.na(breed), !is.na(farm_id)) %>%
  mutate(
    parity_class = factor(ifelse(is.na(parity) | parity < 1, 1, ifelse(parity > 4, 4, parity))),
    region = factor(region),
    farm_id = factor(farm_id),
    breed = factor(breed)
  )

cat("TLY records after filtering:", format(nrow(tly_data), big.mark = ","), "\n")
TLY records after filtering: 9,558 
# ==============================================================================
# CALVING INTERVAL (CI) DATA
# ==============================================================================
cat("Processing CI data...\n")
Processing CI data...
ci_data <- lactation %>%
  mutate(
    animal_id = as.character(animal_id),
    calving_interval_days = as.numeric(calving_interval_days),
    parity = as.numeric(parity)
  ) %>%
  filter(
    parity > 1,
    !is.na(calving_interval_days),
    calving_interval_days >= 280,
    calving_interval_days <= 900
  ) %>%
  left_join(pedigree_lookup, by = "animal_id") %>%
  filter(!is.na(region)) %>%
  mutate(
    breed = coalesce(main_breed_name, breed_ped)
  ) %>%
  filter(!is.na(breed), !is.na(farm_id)) %>%
  mutate(
    parity_class = factor(ifelse(parity > 4, 4, parity)),
    region = factor(region),
    farm_id = factor(farm_id),
    breed = factor(breed)
  )

cat("CI records after filtering:", format(nrow(ci_data), big.mark = ","), "\n\n")
CI records after filtering: 44,049 
# ==============================================================================
# SUMMARY TABLE
# ==============================================================================
data.frame(
  Trait = c("Age at First Calving", "Daily Milk Yield", "Total Lactation Yield", "Calving Interval"),
  Abbrev = c("AFC", "DMY", "TLY", "CI"),
  Records = c(nrow(afc_data), nrow(daily_milk_data), nrow(tly_data), nrow(ci_data)),
  Animals = c(n_distinct(afc_data$animal_id), n_distinct(daily_milk_data$animal_id),
              n_distinct(tly_data$animal_id), n_distinct(ci_data$animal_id)),
  Farms = c(n_distinct(afc_data$farm_id), n_distinct(daily_milk_data$farm_id),
            n_distinct(tly_data$farm_id), n_distinct(ci_data$farm_id)),
  Filter = c("Parity=1, 18-72 mo", "0.5-60 kg, DIM 5-500", "100-20,000 kg", "Parity≥2, 280-900 d")
) %>%
  mutate(Records = format(Records, big.mark = ","),
         Animals = format(Animals, big.mark = ","),
         Farms = format(Farms, big.mark = ",")) %>%
  styled_table(caption = "Data available after quality control filtering")
Data available after quality control filtering
Trait Abbrev Records Animals Farms Filter
Age at First Calving AFC 27,503 27,503 1,239 Parity=1, 18-72 mo
Daily Milk Yield DMY 582,060 15,255 647 0.5-60 kg, DIM 5-500
Total Lactation Yield TLY 9,558 4,931 354 100-20,000 kg
Calving Interval CI 44,049 17,527 440 Parity≥2, 280-900 d

7 Interactive Trait Performance Map

7.1 Daily Milk by Region

# ==============================================================================
# TRAIT PERFORMANCE MAP - Daily Milk Yield by Region
# ==============================================================================

# Calculate regional trait means
trait_by_region <- daily_milk_data %>%
  group_by(region) %>%
  summarise(
    n_records = n(),
    n_animals = n_distinct(animal_id),
    mean_milk = round(mean(daily_yield, na.rm = TRUE), 2),
    sd_milk = round(sd(daily_yield, na.rm = TRUE), 2),
    .groups = 'drop'
  ) %>%
  mutate(region = as.character(region))

# Merge with coordinates
trait_map_data <- trait_by_region %>%
  left_join(kenya_coords, by = "region") %>%
  filter(!is.na(lat), !is.na(lng))

if(nrow(trait_map_data) > 0) {
  
  # Color palette for milk yield
  milk_pal <- colorNumeric(
    palette = c("#FEE2E2", "#FECACA", "#FCA5A5", "#F87171", "#EF4444", "#DC2626", "#B91C1C"),
    domain = trait_map_data$mean_milk
  )
  
  # Create popup content
  trait_map_data <- trait_map_data %>%
    mutate(
      popup_trait = paste0(
        "<div style='font-family: sans-serif; padding: 10px; min-width: 200px;'>",
        "<h4 style='color: #1a1f36; margin: 0 0 10px 0; border-bottom: 2px solid #3B82F6; padding-bottom: 5px;'>",
        "<strong>", region, "</strong></h4>",
        "<div style='background: #EFF6FF; padding: 10px; border-radius: 8px; margin-bottom: 10px;'>",
        "<span style='font-size: 24px; font-weight: bold; color: #1e40af;'>", mean_milk, "</span>",
        "<span style='font-size: 14px; color: #3B82F6;'> kg/day</span>",
        "</div>",
        "<table style='width: 100%; font-size: 12px;'>",
        "<tr><td>± SD:</td><td style='font-weight: bold;'>", sd_milk, " kg</td></tr>",
        "<tr><td>Records:</td><td style='font-weight: bold;'>", format(n_records, big.mark = ","), "</td></tr>",
        "<tr><td>Animals:</td><td style='font-weight: bold;'>", format(n_animals, big.mark = ","), "</td></tr>",
        "</table></div>"
      )
    )
  
  leaflet(trait_map_data, width = "100%", height = 600) %>%
    addProviderTiles("CartoDB.Positron", group = "Light") %>%
    addProviderTiles("Esri.WorldGrayCanvas", group = "Gray") %>%
    
    # Choropleth-style circles
    addCircleMarkers(
      lng = ~lng, lat = ~lat,
      radius = ~sqrt(n_animals) / max(sqrt(n_animals)) * 35 + 8,
      color = "#1e40af",
      weight = 2,
      opacity = 0.8,
      fillColor = ~milk_pal(mean_milk),
      fillOpacity = 0.8,
      popup = ~popup_trait,
      label = ~paste0(region, ": ", mean_milk, " kg/day")
    ) %>%
    
    # Add labels with values
    addLabelOnlyMarkers(
      lng = ~lng, lat = ~lat,
      label = ~paste0(round(mean_milk, 1)),
      labelOptions = labelOptions(
        noHide = TRUE,
        direction = "center",
        textOnly = TRUE,
        style = list(
          "color" = "#1e3a8a",
          "font-weight" = "bold",
          "font-size" = "11px"
        )
      )
    ) %>%
    
    addLegend(
      position = "bottomright",
      pal = milk_pal,
      values = ~mean_milk,
      title = "Daily Milk<br>(kg/day)",
      opacity = 0.9
    ) %>%
    
    addLayersControl(
      baseGroups = c("Light", "Gray"),
      options = layersControlOptions(collapsed = TRUE)
    ) %>%
    
    addScaleBar(position = "bottomleft") %>%
    
    setView(lng = 37.0, lat = -0.5, zoom = 6) %>%
    
    addControl(
      html = "<div style='background: white; padding: 10px 15px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); font-family: sans-serif;'>
              <strong style='font-size: 16px; color: #1a1f36;'>🥛 Daily Milk Yield by Region</strong><br>
              <span style='font-size: 12px; color: #6B7280;'>Circle size = # animals; Color = yield</span></div>",
      position = "topright"
    )
}

7.2 Multi-Trait Comparison

# ==============================================================================
# MULTI-TRAIT COMPARISON MAP
# ==============================================================================

# Calculate all trait means by region
all_traits_by_region <- bind_rows(
  afc_data %>% 
    group_by(region) %>% 
    summarise(mean_val = mean(afc_months), n = n(), .groups = 'drop') %>%
    mutate(trait = "AFC", unit = "months"),
  
  daily_milk_data %>% 
    group_by(region) %>% 
    summarise(mean_val = mean(daily_yield), n = n(), .groups = 'drop') %>%
    mutate(trait = "Daily Milk", unit = "kg/day"),
  
  tly_data %>% 
    group_by(region) %>% 
    summarise(mean_val = mean(total_lactation_yield), n = n(), .groups = 'drop') %>%
    mutate(trait = "Lactation Yield", unit = "kg"),
  
  ci_data %>% 
    group_by(region) %>% 
    summarise(mean_val = mean(calving_interval_days), n = n(), .groups = 'drop') %>%
    mutate(trait = "Calving Interval", unit = "days")
) %>%
  mutate(region = as.character(region))

# Pivot to wide format for popup
traits_wide <- all_traits_by_region %>%
  select(region, trait, mean_val) %>%
  pivot_wider(names_from = trait, values_from = mean_val) %>%
  left_join(kenya_coords, by = "region") %>%
  filter(!is.na(lat), !is.na(lng))

if(nrow(traits_wide) > 0) {
  
  # Create comprehensive popup
  traits_wide <- traits_wide %>%
    mutate(
      popup_all = paste0(
        "<div style='font-family: sans-serif; padding: 12px; min-width: 220px;'>",
        "<h4 style='color: #1a1f36; margin: 0 0 12px 0; border-bottom: 3px solid #D4AF37; padding-bottom: 8px;'>",
        region, "</h4>",
        
        "<div style='display: grid; grid-template-columns: 1fr 1fr; gap: 8px;'>",
        
        # AFC
        "<div style='background: linear-gradient(135deg, #FFF7ED, #FFEDD5); padding: 10px; border-radius: 8px; border-left: 3px solid #F97316;'>",
        "<div style='font-size: 11px; color: #9A3412;'>AFC</div>",
        "<div style='font-size: 18px; font-weight: bold; color: #C2410C;'>", 
        ifelse(is.na(AFC), "N/A", paste0(round(AFC, 1), " mo")), "</div></div>",
        
        # Daily Milk
        "<div style='background: linear-gradient(135deg, #EFF6FF, #DBEAFE); padding: 10px; border-radius: 8px; border-left: 3px solid #3B82F6;'>",
        "<div style='font-size: 11px; color: #1E40AF;'>Daily Milk</div>",
        "<div style='font-size: 18px; font-weight: bold; color: #1D4ED8;'>", 
        ifelse(is.na(`Daily Milk`), "N/A", paste0(round(`Daily Milk`, 1), " kg")), "</div></div>",
        
        # Lactation Yield
        "<div style='background: linear-gradient(135deg, #ECFDF5, #D1FAE5); padding: 10px; border-radius: 8px; border-left: 3px solid #10B981;'>",
        "<div style='font-size: 11px; color: #065F46;'>Lactation</div>",
        "<div style='font-size: 18px; font-weight: bold; color: #047857;'>", 
        ifelse(is.na(`Lactation Yield`), "N/A", paste0(format(round(`Lactation Yield`, 0), big.mark = ","), " kg")), "</div></div>",
        
        # Calving Interval
        "<div style='background: linear-gradient(135deg, #F5F3FF, #EDE9FE); padding: 10px; border-radius: 8px; border-left: 3px solid #7C3AED;'>",
        "<div style='font-size: 11px; color: #5B21B6;'>CI</div>",
        "<div style='font-size: 18px; font-weight: bold; color: #6D28D9;'>", 
        ifelse(is.na(`Calving Interval`), "N/A", paste0(round(`Calving Interval`, 0), " d")), "</div></div>",
        
        "</div></div>"
      )
    )
  
  leaflet(traits_wide, width = "100%", height = 600) %>%
    addProviderTiles("CartoDB.Voyager", group = "Voyager") %>%
    addProviderTiles("Esri.WorldGrayCanvas", group = "Gray") %>%
    
    addCircleMarkers(
      lng = ~lng, lat = ~lat,
      radius = 18,
      color = "#D4AF37",
      weight = 3,
      opacity = 1,
      fillColor = "#1a1f36",
      fillOpacity = 0.85,
      popup = ~popup_all,
      label = ~region
    ) %>%
    
    addLabelOnlyMarkers(
      lng = ~lng, lat = ~lat,
      label = ~region,
      labelOptions = labelOptions(
        noHide = TRUE,
        direction = "bottom",
        textOnly = TRUE,
        offset = c(0, 15),
        style = list(
          "color" = "#1a1f36",
          "font-weight" = "bold",
          "font-size" = "10px"
        )
      )
    ) %>%
    
    addLayersControl(
      baseGroups = c("Voyager", "Gray"),
      options = layersControlOptions(collapsed = TRUE)
    ) %>%
    
    addScaleBar(position = "bottomleft") %>%
    
    setView(lng = 37.0, lat = -0.5, zoom = 6) %>%
    
    addControl(
      html = "<div style='background: white; padding: 12px 15px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); font-family: sans-serif;'>
              <strong style='font-size: 16px; color: #1a1f36;'>📊 Multi-Trait Regional Performance</strong><br>
              <span style='font-size: 12px; color: #6B7280;'>Click markers to compare all traits</span></div>",
      position = "topright"
    )
}

8 Descriptive Statistics

8.1 Summary Statistics

data.frame(
  Trait = c("AFC (months)", "Daily Milk (kg)", "Lactation Milk (kg)", "Calving Interval (days)"),
  N = c(nrow(afc_data), nrow(daily_milk_data), nrow(tly_data), nrow(ci_data)),
  Mean = c(mean(afc_data$afc_months), mean(daily_milk_data$daily_yield),
           mean(tly_data$total_lactation_yield), mean(ci_data$calving_interval_days)),
  SD = c(sd(afc_data$afc_months), sd(daily_milk_data$daily_yield),
         sd(tly_data$total_lactation_yield), sd(ci_data$calving_interval_days)),
  Min = c(min(afc_data$afc_months), min(daily_milk_data$daily_yield),
          min(tly_data$total_lactation_yield), min(ci_data$calving_interval_days)),
  Median = c(median(afc_data$afc_months), median(daily_milk_data$daily_yield),
             median(tly_data$total_lactation_yield), median(ci_data$calving_interval_days)),
  Max = c(max(afc_data$afc_months), max(daily_milk_data$daily_yield),
          max(tly_data$total_lactation_yield), max(ci_data$calving_interval_days)),
  CV = c(sd(afc_data$afc_months)/mean(afc_data$afc_months)*100,
         sd(daily_milk_data$daily_yield)/mean(daily_milk_data$daily_yield)*100,
         sd(tly_data$total_lactation_yield)/mean(tly_data$total_lactation_yield)*100,
         sd(ci_data$calving_interval_days)/mean(ci_data$calving_interval_days)*100)
) %>%
  mutate(N = format(N, big.mark = ","),
         across(c(Mean, SD, CV, Min, Median, Max), ~round(., 2))) %>%
  styled_table(caption = "Descriptive statistics for all traits")
Descriptive statistics for all traits
Trait N Mean SD Min Median Max CV
AFC (months) 27,503 39.34 11.90 18.00 36.0 72.0 30.24
Daily Milk (kg) 582,060 9.24 7.07 0.51 9.2 50.0 76.54
Lactation Milk (kg) 9,558 979.05 1706.33 100.00 364.3 19583.5 174.28
Calving Interval (days) 44,049 459.45 130.92 280.00 416.0 900.0 28.49

8.2 Distribution Plots

p_afc <- ggplot(afc_data, aes(x = afc_months)) +
  geom_histogram(aes(y = after_stat(density)), bins = 35, 
                 fill = trait_colors["AFC"], color = "white", alpha = 0.85) +
  geom_density(linewidth = 1.2, color = "#9a3412") +
  geom_vline(xintercept = mean(afc_data$afc_months), linetype = "dashed", 
             color = "#1a1f36", linewidth = 1) +
  annotate("text", x = mean(afc_data$afc_months) + 3, y = Inf, vjust = 2, hjust = 0,
           label = paste0("Mean: ", round(mean(afc_data$afc_months), 1), " mo"),
           fontface = "bold", size = 3.5) +
  labs(title = "Age at First Calving", x = "AFC (months)", y = "Density")

p_dm <- ggplot(daily_milk_data, aes(x = daily_yield)) +
  geom_histogram(aes(y = after_stat(density)), bins = 45, 
                 fill = trait_colors["Daily Milk"], color = "white", alpha = 0.85) +
  geom_density(linewidth = 1.2, color = "#1e3a8a") +
  geom_vline(xintercept = mean(daily_milk_data$daily_yield), linetype = "dashed",
             color = "#1a1f36", linewidth = 1) +
  annotate("text", x = mean(daily_milk_data$daily_yield) + 2, y = Inf, vjust = 2, hjust = 0,
           label = paste0("Mean: ", round(mean(daily_milk_data$daily_yield), 1), " kg"),
           fontface = "bold", size = 3.5) +
  labs(title = "Daily Milk Yield", x = "Daily Yield (kg)", y = "Density")

p_tly <- ggplot(tly_data, aes(x = total_lactation_yield)) +
  geom_histogram(aes(y = after_stat(density)), bins = 45, 
                 fill = trait_colors["Lactation Milk"], color = "white", alpha = 0.85) +
  geom_density(linewidth = 1.2, color = "#065f46") +
  geom_vline(xintercept = mean(tly_data$total_lactation_yield), linetype = "dashed",
             color = "#1a1f36", linewidth = 1) +
  annotate("text", x = mean(tly_data$total_lactation_yield) + 500, y = Inf, vjust = 2, hjust = 0,
           label = paste0("Mean: ", format(round(mean(tly_data$total_lactation_yield), 0), big.mark = ","), " kg"),
           fontface = "bold", size = 3.5) +
  scale_x_continuous(labels = comma) +
  labs(title = "Total Lactation Yield", x = "Lactation Yield (kg)", y = "Density")

p_ci <- ggplot(ci_data, aes(x = calving_interval_days)) +
  geom_histogram(aes(y = after_stat(density)), bins = 40, 
                 fill = trait_colors["Calving Interval"], color = "white", alpha = 0.85) +
  geom_density(linewidth = 1.2, color = "#4c1d95") +
  geom_vline(xintercept = mean(ci_data$calving_interval_days), linetype = "dashed",
             color = "#1a1f36", linewidth = 1) +
  annotate("text", x = mean(ci_data$calving_interval_days) + 30, y = Inf, vjust = 2, hjust = 0,
           label = paste0("Mean: ", round(mean(ci_data$calving_interval_days), 0), " d"),
           fontface = "bold", size = 3.5) +
  labs(title = "Calving Interval", x = "Calving Interval (days)", y = "Density")

(p_afc + p_dm) / (p_tly + p_ci) +
  plot_annotation(
    title = "Distribution of Dairy Cattle Traits in Kenya",
    subtitle = "Dashed lines indicate population means",
    theme = theme(plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
                  plot.subtitle = element_text(size = 12, hjust = 0.5, color = "#6B7280"))
  )

8.3 Box Plots by Region

# Boxplots with improved spacing
p1_box <- ggplot(afc_data, aes(x = reorder(region, afc_months, FUN = median), y = afc_months)) +
  geom_boxplot(fill = trait_colors["AFC"], alpha = 0.7, outlier.size = 0.5) +
  stat_summary(fun = mean, geom = "point", shape = 18, size = 4, color = "darkred") +
  stat_summary(fun = mean, geom = "text", aes(label = round(after_stat(y), 1)), 
               vjust = -1, size = 3.5, fontface = "bold") +
  labs(title = "Age at First Calving by Region", x = NULL, y = "AFC (months)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10, face = "bold"),
        plot.margin = margin(10, 10, 10, 20))

p2_box <- ggplot(daily_milk_data, aes(x = reorder(region, daily_yield, FUN = median), y = daily_yield)) +
  geom_boxplot(fill = trait_colors["Daily Milk"], alpha = 0.7, outlier.size = 0.5) +
  stat_summary(fun = mean, geom = "point", shape = 18, size = 4, color = "darkblue") +
  stat_summary(fun = mean, geom = "text", aes(label = round(after_stat(y), 1)), 
               vjust = -1, size = 3.5, fontface = "bold") +
  labs(title = "Daily Milk Yield by Region", x = NULL, y = "Daily Milk (kg)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10, face = "bold"),
        plot.margin = margin(10, 10, 10, 20))

p3_box <- ggplot(tly_data, aes(x = reorder(region, total_lactation_yield, FUN = median), y = total_lactation_yield)) +
  geom_boxplot(fill = trait_colors["Lactation Milk"], alpha = 0.7, outlier.size = 0.5) +
  stat_summary(fun = mean, geom = "point", shape = 18, size = 4, color = "darkgreen") +
  stat_summary(fun = mean, geom = "text", aes(label = format(round(after_stat(y), 0), big.mark = ",")), 
               vjust = -1, size = 3.5, fontface = "bold") +
  labs(title = "Total Lactation Yield by Region", x = NULL, y = "Lactation Milk (kg)") +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10, face = "bold"),
        plot.margin = margin(10, 10, 10, 20))

p4_box <- ggplot(ci_data, aes(x = reorder(region, calving_interval_days, FUN = median), y = calving_interval_days)) +
  geom_boxplot(fill = trait_colors["Calving Interval"], alpha = 0.7, outlier.size = 0.5) +
  stat_summary(fun = mean, geom = "point", shape = 18, size = 4, color = "purple4") +
  stat_summary(fun = mean, geom = "text", aes(label = round(after_stat(y), 0)), 
               vjust = -1, size = 3.5, fontface = "bold") +
  labs(title = "Calving Interval by Region", x = NULL, y = "CI (days)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10, face = "bold"),
        plot.margin = margin(10, 10, 10, 20))

(p1_box + p2_box) / (p3_box + p4_box) +
  plot_annotation(
    title = "Trait Distribution by Region",
    subtitle = "Diamond markers show mean values; numbers above show regional means",
    theme = theme(plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
                  plot.subtitle = element_text(size = 11, hjust = 0.5, color = "#6B7280"))
  )

8.4 Lactation Curve

dim_summary <- daily_milk_data %>%
  mutate(dim_class_num = as.numeric(as.character(dim_class))) %>%
  group_by(dim_class_num) %>%
  summarise(N = n(), Mean = mean(daily_yield), SD = sd(daily_yield),
            SE = SD / sqrt(N), Lower = Mean - 1.96 * SE, Upper = Mean + 1.96 * SE,
            .groups = 'drop') %>%
  mutate(DIM_midpoint = (dim_class_num - 0.5) * 30)

peak_row <- dim_summary[which.max(dim_summary$Mean), ]

ggplot(dim_summary, aes(x = DIM_midpoint, y = Mean)) +
  geom_ribbon(aes(ymin = Lower, ymax = Upper), fill = trait_colors["Daily Milk"], alpha = 0.25) +
  geom_line(linewidth = 1.5, color = trait_colors["Daily Milk"]) +
  geom_point(size = 3.5, color = trait_colors["Daily Milk"], fill = "white", shape = 21, stroke = 1.5) +
  geom_vline(xintercept = peak_row$DIM_midpoint, linetype = "dashed", color = "#dc2626", linewidth = 0.8) +
  geom_point(data = peak_row, aes(x = DIM_midpoint, y = Mean), size = 6, color = "#dc2626", shape = 18) +
  annotate("text", x = peak_row$DIM_midpoint + 35, y = peak_row$Mean + 0.5,
           label = paste0("Peak: ", round(peak_row$Mean, 1), " kg\n@ DIM ", round(peak_row$DIM_midpoint, 0)),
           color = "#dc2626", fontface = "bold", size = 4) +
  scale_x_continuous(breaks = seq(0, 500, 50), limits = c(0, 520)) +
  labs(title = "Average Lactation Curve of Kenya Dairy Cattle",
       subtitle = "Mean daily milk yield (±95% CI) across days in milk (DIM)",
       x = "Days in Milk (DIM)", y = "Daily Milk Yield (kg)",
       caption = paste("Based on", format(nrow(daily_milk_data), big.mark = ","), "test-day records"))


9 Fixed Effects Analysis

9.1 Linear Models

afc_model <- lm(afc_months ~ region + breed, data = afc_data)
dm_model <- lm(daily_yield ~ region + breed + parity_class + dim_class, data = daily_milk_data)
tly_model <- lm(total_lactation_yield ~ region + breed + parity_class, data = tly_data)
ci_model <- lm(calving_interval_days ~ region + breed + parity_class, data = ci_data)

data.frame(
  Trait = c("AFC", "Daily Milk", "Lactation Yield", "Calving Interval"),
  Model = c("Region + Breed", "Region + Breed + Parity + DIM",
            "Region + Breed + Parity", "Region + Breed + Parity"),
  N = c(nrow(afc_data), nrow(daily_milk_data), nrow(tly_data), nrow(ci_data)),
  R2 = c(summary(afc_model)$r.squared, summary(dm_model)$r.squared,
         summary(tly_model)$r.squared, summary(ci_model)$r.squared) * 100,
  Adj_R2 = c(summary(afc_model)$adj.r.squared, summary(dm_model)$adj.r.squared,
             summary(tly_model)$adj.r.squared, summary(ci_model)$adj.r.squared) * 100
) %>%
  mutate(N = format(N, big.mark = ","),
         R2 = paste0(round(R2, 2), "%"),
         Adj_R2 = paste0(round(Adj_R2, 2), "%")) %>%
  rename(`R² (%)` = R2, `Adj. R² (%)` = Adj_R2) %>%
  styled_table(caption = "Variance explained by fixed effects models")
Variance explained by fixed effects models
Trait Model N R² (%) Adj. R² (%)
AFC Region + Breed 27,503 22.07% 21.9%
Daily Milk Region + Breed + Parity + DIM 582,060 53.8% 53.8%
Lactation Yield Region + Breed + Parity 9,558 36.27% 35.98%
Calving Interval Region + Breed + Parity 44,049 6.3% 6.18%

9.2 R² Visualization

r2_data <- data.frame(
  Trait = c("AFC", "Daily Milk", "Lactation Yield", "Calving Interval"),
  R_squared = c(summary(afc_model)$r.squared, summary(dm_model)$r.squared,
                summary(tly_model)$r.squared, summary(ci_model)$r.squared) * 100
)

ggplot(r2_data, aes(x = reorder(Trait, R_squared), y = R_squared, fill = Trait)) +
  geom_col(width = 0.6, show.legend = FALSE) +
  geom_text(aes(label = paste0(round(R_squared, 1), "%")), hjust = -0.2, size = 5, fontface = "bold") +
  scale_fill_manual(values = trait_colors) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.2)), limits = c(0, 100)) +
  coord_flip() +
  labs(title = "Variance Explained by Fixed Effects",
       subtitle = "DIM class has major impact on daily milk yield",
       x = NULL, y = "R-squared (%)")


10 Key Findings

10.0.1 📊 Major Findings

  1. Breed Effects: Holstein-Friesian and improved crossbreds outperform indigenous breeds
  2. Regional Variation: Significant differences exist between regions - visualized in interactive maps
  3. Lactation Dynamics: Peak production occurs around DIM 30-60
  4. Fixed Effects Impact: DIM explains substantial variation in daily milk yields
  5. Geographic Clustering: Major dairy production concentrated in Central and Rift Valley regions

11 Session Information

sessionInfo()
R version 4.5.1 (2025-06-13 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 26100)

Matrix products: default
  LAPACK version 3.12.1

locale:
[1] LC_COLLATE=English_Kenya.utf8  LC_CTYPE=English_Kenya.utf8   
[3] LC_MONETARY=English_Kenya.utf8 LC_NUMERIC=C                  
[5] LC_TIME=English_Kenya.utf8    

time zone: Africa/Nairobi
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] RColorBrewer_1.1-3   htmltools_0.5.8.1    sf_1.0-24           
 [4] leaflet.extras_2.0.1 leaflet_2.2.3        patchwork_1.3.2     
 [7] broom_1.0.9          scales_1.4.0         kableExtra_1.4.0    
[10] knitr_1.50           ggplot2_3.5.2        tidyr_1.3.1         
[13] dplyr_1.1.4          data.table_1.17.8   

loaded via a namespace (and not attached):
 [1] sass_0.4.10             generics_0.1.4          class_7.3-23           
 [4] xml2_1.4.0              KernSmooth_2.23-26      stringi_1.8.7          
 [7] digest_0.6.37           magrittr_2.0.3          evaluate_1.0.5         
[10] grid_4.5.1              fastmap_1.2.0           jsonlite_2.0.0         
[13] e1071_1.7-17            backports_1.5.0         DBI_1.2.3              
[16] purrr_1.1.0             crosstalk_1.2.2         viridisLite_0.4.2      
[19] textshaping_1.0.2       jquerylib_0.1.4         cli_3.6.5              
[22] rlang_1.1.6             units_1.0-0             withr_3.0.2            
[25] cachem_1.1.0            yaml_2.3.10             tools_4.5.1            
[28] mime_0.13               vctrs_0.6.5             R6_2.6.1               
[31] proxy_0.4-29            lifecycle_1.0.4         classInt_0.4-11        
[34] stringr_1.5.1           leaflet.providers_2.0.0 htmlwidgets_1.6.4      
[37] pkgconfig_2.0.3         pillar_1.11.0           bslib_0.9.0            
[40] gtable_0.3.6            Rcpp_1.1.0              glue_1.8.0             
[43] systemfonts_1.2.3       xfun_0.52               tibble_3.3.0           
[46] tidyselect_1.2.1        rstudioapi_0.17.1       farver_2.1.2           
[49] labeling_0.4.3          rmarkdown_2.29          svglite_2.2.1          
[52] compiler_4.5.1