# Load required packages
library(data.table)
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(gridExtra)
library(knitr)
library(kableExtra)
library(viridis)

# Set theme
theme_set(theme_bw(base_size = 12))

Executive Summary

This report provides a comprehensive data quality assessment for the Kenya ADGG/TDGG dairy datasets. It identifies:

  • Outliers in key traits (AFC, milk yield, calving interval, etc.)
  • Impossible/implausible values (negative ages, extreme yields)
  • Data entry errors (dates in wrong fields, miscoded values)
  • Missing data patterns
  • Recommendations for data cleaning

1 Data Loading

# SET YOUR FILE PATH HERE
data_path <- "C:/Users/SAlemu/OneDrive - CGIAR/Documents/KenyaData/New_Kenya_data/"

# Read data - treating "NULL" as NA
lactation <- fread(paste0(data_path, "lactation_calving.csv"), 
                   na.strings = c("NULL", "null", "NA", "N/A", "", " ", "NaN"),
                   stringsAsFactors = FALSE)

milking <- fread(paste0(data_path, "milking_data.csv"), 
                 na.strings = c("NULL", "null", "NA", "N/A", "", " ", "NaN"),
                 stringsAsFactors = FALSE)

# Convert numeric columns
lactation <- lactation %>%
  mutate(
    parity = as.numeric(parity),
    afc_months = as.numeric(afc_months),
    afc_days = as.numeric(afc_days),
    calving_interval_days = as.numeric(calving_interval_days),
    calving_interval_months = as.numeric(calving_interval_months),
    lactation_length_days = as.numeric(lactation_length_days),
    total_lactation_yield = as.numeric(total_lactation_yield),
    testday_count = as.numeric(testday_count),
    days_open = as.numeric(days_open)
  )

milking <- milking %>%
  mutate(
    parity = as.numeric(parity),
    afc_months = as.numeric(afc_months),
    afc_days = as.numeric(afc_days),
    dim = as.numeric(dim),
    daily_yield = as.numeric(daily_yield),
    milk_morning = as.numeric(milk_morning),
    milk_evening = as.numeric(milk_evening),
    milk_midday = as.numeric(milk_midday),
    testday_no = as.numeric(testday_no),
    calving_age_months = as.numeric(calving_age_months)
  )

cat("Lactation data:", format(nrow(lactation), big.mark = ","), "records\n")
## Lactation data: 88,352 records
cat("Milking data:", format(nrow(milking), big.mark = ","), "records\n")
## Milking data: 1,048,575 records

2 Overall Descriptive Statistics (Raw Data)

This section shows the raw statistics including outliers to highlight data quality issues.

# Function to calculate comprehensive stats with NA handling
calc_stats <- function(x, var_name) {
  x_valid <- x[!is.na(x)]
  n_total <- length(x)
  n_valid <- length(x_valid)
  n_missing <- sum(is.na(x))
  
  if (n_valid == 0) {
    return(data.frame(
      Variable = var_name,
      N_Total = n_total,
      N_Valid = 0,
      N_Missing = n_missing,
      Pct_Missing = 100,
      Min = NA, Max = NA, Mean = NA, SD = NA, Median = NA
    ))
  }
  
  data.frame(
    Variable = var_name,
    N_Total = n_total,
    N_Valid = n_valid,
    N_Missing = n_missing,
    Pct_Missing = round(100 * n_missing / n_total, 2),
    Min = round(min(x_valid), 2),
    Max = round(max(x_valid), 2),
    Mean = round(mean(x_valid), 2),
    SD = round(sd(x_valid), 2),
    Median = round(median(x_valid), 2)
  )
}

# Lactation traits - RAW (including outliers)
lact_stats_raw <- rbind(
  calc_stats(lactation$afc_months[lactation$parity == 1], "AFC (months) - Parity 1"),
  calc_stats(lactation$afc_days[lactation$parity == 1], "AFC (days) - Parity 1"),
  calc_stats(lactation$calving_interval_days[lactation$parity > 1], "Calving Interval (days)"),
  calc_stats(lactation$lactation_length_days, "Lactation Length (days)"),
  calc_stats(lactation$total_lactation_yield, "Total Lactation Yield (kg)"),
  calc_stats(lactation$days_open, "Days Open"),
  calc_stats(lactation$parity, "Parity"),
  calc_stats(lactation$testday_count, "Test-day Count")
)

kable(lact_stats_raw, 
      caption = "Lactation Data - RAW Statistics (Including Outliers)",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  column_spec(6, color = "red", bold = TRUE) %>%
  column_spec(7, color = "red", bold = TRUE)
Lactation Data - RAW Statistics (Including Outliers)
Variable N_Total N_Valid N_Missing Pct_Missing Min Max Mean SD Median
AFC (months) - Parity 1 37,598 33,015 4,583 12.19 -394 24,295.0 50.96 364.55 38
AFC (days) - Parity 1 37,598 33,015 4,583 12.19 -11,956 738,588.0 1,563.98 11,082.36 1,158
Calving Interval (days) 51,116 50,463 653 1.28 0 16,530.0 521.70 342.46 424
Lactation Length (days) 88,352 50,707 37,645 42.61 0 669,342.0 535.21 2,990.38 424
Total Lactation Yield (kg) 88,352 88,347 5 0.01 0 23,186.4 112.02 642.21 0
Days Open 88,352 50,710 37,642 42.60 -280 669,062.0 255.21 2,990.29 144
Parity 88,352 87,990 362 0.41 1 32.0 2.42 1.82 2
Test-day Count 88,352 88,352 0 0.00 0 2,362.0 19.61 68.67 0

⚠️ KEY OUTLIERS VISIBLE IN MIN/MAX:

  • AFC: Min = -394 months (NEGATIVE!), Max = 24,295 months
  • Calving Interval: Min = 0 days, Max = 16,530 days
  • Days Open: Min = -280 days (NEGATIVE!), Max = 669,062 days
# Milking traits - RAW
milk_stats_raw <- rbind(
  calc_stats(milking$daily_yield, "Daily Milk Yield (kg)"),
  calc_stats(milking$milk_morning, "Morning Milk (kg)"),
  calc_stats(milking$milk_evening, "Evening Milk (kg)"),
  calc_stats(milking$dim, "Days in Milk (DIM)"),
  calc_stats(milking$parity, "Parity"),
  calc_stats(milking$afc_months, "AFC (months)"),
  calc_stats(milking$testday_no, "Test-day Number")
)

kable(milk_stats_raw, 
      caption = "Milking Data - RAW Statistics (Including Outliers)",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  column_spec(6, color = "red", bold = TRUE) %>%
  column_spec(7, color = "red", bold = TRUE)
Milking Data - RAW Statistics (Including Outliers)
Variable N_Total N_Valid N_Missing Pct_Missing Min Max Mean SD Median
Daily Milk Yield (kg) 1,048,575 1,048,546 29 0.00 0 50 5.51 6.99 1.38
Morning Milk (kg) 1,048,575 1,048,415 160 0.02 0 459 1.90 3.84 0.00
Evening Milk (kg) 1,048,575 1,048,413 162 0.02 -1 29 1.81 3.18 0.00
Days in Milk (DIM) 1,048,575 1,048,575 0 0.00 -729,939 7,752 220.35 2,008.49 168.00
Parity 1,048,575 1,048,403 172 0.02 1 32 2.66 1.99 2.00
AFC (months) 1,048,575 389,867 658,708 62.82 -288 270 38.29 19.43 33.00
Test-day Number 1,048,575 1,017,335 31,240 2.98 1 1,514 63.66 123.64 18.00

⚠️ KEY OUTLIERS VISIBLE IN MIN/MAX:

  • DIM: Min = -729,939 days (EXTREME NEGATIVE!), Max = 7,752 days
  • Evening Milk: Min = -1 kg (NEGATIVE!)
  • AFC in milking: Min = -288 months (NEGATIVE!)

3 Cleaned Statistics (After Removing Outliers)

Comparing raw vs cleaned statistics to show the impact of outliers.

# Define cleaning thresholds
# AFC: 18-72 months
# Calving Interval: 300-700 days
# Lactation Length: 60-500 days
# Total Lactation Yield: > 0 and <= 15000 kg
# Days Open: 21-400 days
# Parity: 1-12
# Daily Yield: 0.5-50 kg
# DIM: 5-500 days

# Lactation - CLEANED stats
lact_stats_clean <- rbind(
  calc_stats(lactation$afc_months[lactation$parity == 1 & 
                                   lactation$afc_months >= 18 & 
                                   lactation$afc_months <= 72], 
             "AFC (months) - CLEANED"),
  calc_stats(lactation$calving_interval_days[lactation$parity > 1 & 
                                              lactation$calving_interval_days >= 300 & 
                                              lactation$calving_interval_days <= 700], 
             "Calving Interval (days) - CLEANED"),
  calc_stats(lactation$lactation_length_days[lactation$lactation_length_days >= 60 & 
                                              lactation$lactation_length_days <= 500], 
             "Lactation Length (days) - CLEANED"),
  calc_stats(lactation$total_lactation_yield[lactation$total_lactation_yield > 0 & 
                                              lactation$total_lactation_yield <= 15000], 
             "Total Lactation Yield (kg) - CLEANED"),
  calc_stats(lactation$days_open[lactation$days_open >= 21 & 
                                  lactation$days_open <= 400], 
             "Days Open - CLEANED"),
  calc_stats(lactation$parity[lactation$parity >= 1 & lactation$parity <= 12], 
             "Parity - CLEANED")
)

kable(lact_stats_clean, 
      caption = "Lactation Data - CLEANED Statistics (Outliers Removed)",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)
Lactation Data - CLEANED Statistics (Outliers Removed)
Variable N_Total N_Valid N_Missing Pct_Missing Min Max Mean SD Median
AFC (months) - CLEANED 32,804 28,229 4,575 13.95 18.00 72.0 39.36 11.89 36
Calving Interval (days) - CLEANED 41,843 41,248 595 1.42 300.00 700.0 433.45 92.15 408
Lactation Length (days) - CLEANED 71,218 33,573 37,645 52.86 60.00 500.0 386.08 60.86 383
Total Lactation Yield (kg) - CLEANED 37,189 37,184 5 0.01 0.02 14,973.5 261.79 927.82 19
Days Open - CLEANED 78,444 40,802 37,642 47.99 21.00 400.0 150.28 87.90 126
Parity - CLEANED 88,289 87,927 362 0.41 1.00 12.0 2.41 1.76 2
# Milking - CLEANED stats
milk_stats_clean <- rbind(
  calc_stats(milking$daily_yield[milking$daily_yield >= 0.5 & milking$daily_yield <= 50], 
             "Daily Milk Yield (kg) - CLEANED"),
  calc_stats(milking$milk_morning[milking$milk_morning > 0 & milking$milk_morning <= 30], 
             "Morning Milk (kg) - CLEANED"),
  calc_stats(milking$milk_evening[milking$milk_evening > 0 & milking$milk_evening <= 20], 
             "Evening Milk (kg) - CLEANED"),
  calc_stats(milking$dim[milking$dim >= 5 & milking$dim <= 500], 
             "DIM - CLEANED"),
  calc_stats(milking$afc_months[milking$afc_months >= 18 & milking$afc_months <= 72], 
             "AFC (months) - CLEANED")
)

kable(milk_stats_clean, 
      caption = "Milking Data - CLEANED Statistics (Outliers Removed)",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)
Milking Data - CLEANED Statistics (Outliers Removed)
Variable N_Total N_Valid N_Missing Pct_Missing Min Max Mean SD Median
Daily Milk Yield (kg) - CLEANED 675,222 675,193 29 0.00 0.50 50 8.54 7.08 8.1
Morning Milk (kg) - CLEANED 493,797 493,637 160 0.03 0.01 28 4.00 3.59 2.9
Evening Milk (kg) - CLEANED 453,383 453,221 162 0.04 0.01 20 4.19 3.67 3.4
DIM - CLEANED 957,939 957,939 0 0.00 5.00 500 169.61 112.75 156.0
AFC (months) - CLEANED 1,024,502 365,794 658,708 64.30 18.00 72 35.23 9.72 33.0

4 Comparison: Raw vs Cleaned Statistics

# Create comparison table
comparison <- data.frame(
  Trait = c("AFC (months)", "Calving Interval (days)", "Lactation Length (days)", 
            "Total Lactation Yield (kg)", "Days Open", "Daily Milk Yield (kg)", "DIM"),
  
  Raw_N = c(
    sum(!is.na(lactation$afc_months) & lactation$parity == 1),
    sum(!is.na(lactation$calving_interval_days) & lactation$parity > 1),
    sum(!is.na(lactation$lactation_length_days)),
    sum(!is.na(lactation$total_lactation_yield)),
    sum(!is.na(lactation$days_open)),
    sum(!is.na(milking$daily_yield)),
    sum(!is.na(milking$dim))
  ),
  
  Raw_Min = c(
    round(min(lactation$afc_months[lactation$parity == 1], na.rm = TRUE), 1),
    round(min(lactation$calving_interval_days[lactation$parity > 1], na.rm = TRUE), 1),
    round(min(lactation$lactation_length_days, na.rm = TRUE), 1),
    round(min(lactation$total_lactation_yield, na.rm = TRUE), 1),
    round(min(lactation$days_open, na.rm = TRUE), 1),
    round(min(milking$daily_yield, na.rm = TRUE), 2),
    round(min(milking$dim, na.rm = TRUE), 0)
  ),
  
  Raw_Max = c(
    round(max(lactation$afc_months[lactation$parity == 1], na.rm = TRUE), 1),
    round(max(lactation$calving_interval_days[lactation$parity > 1], na.rm = TRUE), 1),
    round(max(lactation$lactation_length_days, na.rm = TRUE), 1),
    round(max(lactation$total_lactation_yield, na.rm = TRUE), 1),
    round(max(lactation$days_open, na.rm = TRUE), 1),
    round(max(milking$daily_yield, na.rm = TRUE), 2),
    round(max(milking$dim, na.rm = TRUE), 0)
  ),
  
  Raw_Mean = c(
    round(mean(lactation$afc_months[lactation$parity == 1], na.rm = TRUE), 2),
    round(mean(lactation$calving_interval_days[lactation$parity > 1], na.rm = TRUE), 2),
    round(mean(lactation$lactation_length_days, na.rm = TRUE), 2),
    round(mean(lactation$total_lactation_yield, na.rm = TRUE), 2),
    round(mean(lactation$days_open, na.rm = TRUE), 2),
    round(mean(milking$daily_yield, na.rm = TRUE), 2),
    round(mean(milking$dim, na.rm = TRUE), 2)
  ),
  
  Clean_N = c(
    sum(lactation$parity == 1 & lactation$afc_months >= 18 & lactation$afc_months <= 72, na.rm = TRUE),
    sum(lactation$parity > 1 & lactation$calving_interval_days >= 300 & lactation$calving_interval_days <= 700, na.rm = TRUE),
    sum(lactation$lactation_length_days >= 60 & lactation$lactation_length_days <= 500, na.rm = TRUE),
    sum(lactation$total_lactation_yield > 0 & lactation$total_lactation_yield <= 15000, na.rm = TRUE),
    sum(lactation$days_open >= 21 & lactation$days_open <= 400, na.rm = TRUE),
    sum(milking$daily_yield >= 0.5 & milking$daily_yield <= 50, na.rm = TRUE),
    sum(milking$dim >= 5 & milking$dim <= 500, na.rm = TRUE)
  ),
  
  Clean_Mean = c(
    round(mean(lactation$afc_months[lactation$parity == 1 & lactation$afc_months >= 18 & lactation$afc_months <= 72], na.rm = TRUE), 2),
    round(mean(lactation$calving_interval_days[lactation$parity > 1 & lactation$calving_interval_days >= 300 & lactation$calving_interval_days <= 700], na.rm = TRUE), 2),
    round(mean(lactation$lactation_length_days[lactation$lactation_length_days >= 60 & lactation$lactation_length_days <= 500], na.rm = TRUE), 2),
    round(mean(lactation$total_lactation_yield[lactation$total_lactation_yield > 0 & lactation$total_lactation_yield <= 15000], na.rm = TRUE), 2),
    round(mean(lactation$days_open[lactation$days_open >= 21 & lactation$days_open <= 400], na.rm = TRUE), 2),
    round(mean(milking$daily_yield[milking$daily_yield >= 0.5 & milking$daily_yield <= 50], na.rm = TRUE), 2),
    round(mean(milking$dim[milking$dim >= 5 & milking$dim <= 500], na.rm = TRUE), 2)
  ),
  
  Filter_Applied = c(
    "18-72 months",
    "300-700 days",
    "60-500 days",
    ">0 & ≤15,000 kg",
    "21-400 days",
    "0.5-50 kg",
    "5-500 days"
  )
)

comparison <- comparison %>%
  mutate(
    Records_Lost = Raw_N - Clean_N,
    Pct_Lost = round(100 * Records_Lost / Raw_N, 2)
  )

kable(comparison, 
      caption = "Comparison: Raw vs Cleaned Statistics",
      col.names = c("Trait", "Raw N", "Raw Min", "Raw Max", "Raw Mean", 
                    "Clean N", "Clean Mean", "Filter", "Records Lost", "% Lost"),
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  column_spec(3, color = "red") %>%
  column_spec(4, color = "red") %>%
  column_spec(10, color = "blue", bold = TRUE)
Comparison: Raw vs Cleaned Statistics
Trait Raw N Raw Min Raw Max Raw Mean Clean N Clean Mean Filter Records Lost % Lost
AFC (months) NA -394 24,295.0 50.96 28,229 39.36 18-72 months NA NA
Calving Interval (days) NA 0 16,530.0 521.70 41,248 433.45 300-700 days NA NA
Lactation Length (days) 50,707 0 669,342.0 535.21 33,573 386.08 60-500 days 17,134 33.79
Total Lactation Yield (kg) 88,347 0 23,186.4 112.02 37,184 261.79 >0 & ≤15,000 kg 51,163 57.91
Days Open 50,710 -280 669,062.0 255.21 40,802 150.28 21-400 days 9,908 19.54
Daily Milk Yield (kg) 1,048,546 0 50.0 5.51 675,193 8.54 0.5-50 kg 373,353 35.61
DIM 1,048,575 -729,939 7,752.0 220.35 957,939 169.61 5-500 days 90,636 8.64

5 Overview of Data Issues

# Create summary of issues
issues_summary <- data.frame(
  Issue_Category = c(
    "Negative AFC values",
    "AFC > 100 months (8+ years)",
    "AFC < 18 months (too young)",
    "Negative calving interval",
    "Calving interval > 1000 days",
    "Negative days in milk",
    "DIM > 1000 days",
    "Daily yield = 0",
    "Daily yield > 50 kg",
    "Negative daily yield",
    "Total lactation yield = 0",
    "Total lactation yield > 20,000 kg",
    "Negative days open",
    "Sex field contains dates",
    "Animal type = 'FEMALE' (misplaced)"
  ),
  Dataset = c(rep("Lactation", 5), rep("Milking", 6), rep("Lactation", 4)),
  Count = c(
    sum(lactation$afc_months < 0, na.rm = TRUE),
    sum(lactation$afc_months > 100, na.rm = TRUE),
    sum(lactation$afc_months > 0 & lactation$afc_months < 18, na.rm = TRUE),
    sum(lactation$calving_interval_days < 0, na.rm = TRUE),
    sum(lactation$calving_interval_days > 1000, na.rm = TRUE),
    sum(milking$dim < 0, na.rm = TRUE),
    sum(milking$dim > 1000, na.rm = TRUE),
    sum(milking$daily_yield == 0, na.rm = TRUE),
    sum(milking$daily_yield > 50, na.rm = TRUE),
    sum(milking$daily_yield < 0, na.rm = TRUE),
    sum(lactation$total_lactation_yield == 0, na.rm = TRUE),
    sum(lactation$total_lactation_yield > 20000, na.rm = TRUE),
    sum(lactation$days_open < 0, na.rm = TRUE),
    sum(grepl("/", lactation$sex), na.rm = TRUE),
    sum(lactation$animal_type == "FEMALE", na.rm = TRUE)
  )
)

issues_summary <- issues_summary %>%
  mutate(
    Percentage = round(100 * Count / ifelse(Dataset == "Lactation", nrow(lactation), nrow(milking)), 3)
  ) %>%
  arrange(desc(Count))

kable(issues_summary, 
      caption = "Summary of Data Quality Issues",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE) %>%
  row_spec(which(issues_summary$Count > 1000), bold = TRUE, color = "red")
Summary of Data Quality Issues
Issue_Category Dataset Count Percentage
Daily yield = 0 Milking 333,069 31.764
Total lactation yield = 0 Milking 51,154 4.878
DIM > 1000 days Milking 22,267 2.124
Calving interval > 1000 days Lactation 3,091 3.499
AFC > 100 months (8+ years) Lactation 2,280 2.581
Negative days open Lactation 1,531 1.733
AFC < 18 months (too young) Lactation 723 0.818
Negative AFC values Lactation 480 0.543
Negative days in milk Milking 10 0.001
Sex field contains dates Lactation 8 0.009
Animal type = ‘FEMALE’ (misplaced) Lactation 8 0.009
Total lactation yield > 20,000 kg Lactation 1 0.001
Negative calving interval Lactation 0 0.000
Daily yield > 50 kg Milking 0 0.000
Negative daily yield Milking 0 0.000

6 Age at First Calving (AFC) Issues

6.1 Distribution with Outliers

# AFC distribution showing all values including outliers
afc_data <- lactation %>%
  filter(parity == 1, !is.na(afc_months)) %>%
  mutate(
    afc_category = case_when(
      afc_months < 0 ~ "Negative (ERROR)",
      afc_months >= 0 & afc_months < 18 ~ "Too Young (<18 mo)",
      afc_months >= 18 & afc_months <= 72 ~ "Normal (18-72 mo)",
      afc_months > 72 & afc_months <= 100 ~ "Late (72-100 mo)",
      afc_months > 100 ~ "Extreme (>100 mo)"
    )
  )

# Summary table with Min, Max, Mean, SD
afc_summary <- afc_data %>%
  group_by(afc_category) %>%
  summarise(
    N = n(),
    Min = round(min(afc_months, na.rm = TRUE), 1),
    Max = round(max(afc_months, na.rm = TRUE), 1),
    Mean = round(mean(afc_months, na.rm = TRUE), 2),
    SD = round(sd(afc_months, na.rm = TRUE), 2),
    Median = round(median(afc_months, na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  mutate(Percentage = round(100 * N / sum(N), 2))

kable(afc_summary, 
      caption = "AFC Value Categories with Min/Max/Mean",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
AFC Value Categories with Min/Max/Mean
afc_category N Min Max Mean SD Median Percentage
Extreme (>100 mo) 1,481 101 24,295 249.59 1,706.99 117 4.49
Late (72-100 mo) 2,495 73 100 84.47 8.11 84 7.56
Negative (ERROR) 127 -394 -1 -102.76 106.54 -56 0.38
Normal (18-72 mo) 28,229 18 72 39.36 11.89 36 85.50
Too Young (<18 mo) 683 0 17 5.74 6.30 3 2.07
# Plot 1: Full distribution with log scale
p1 <- ggplot(afc_data, aes(x = afc_months)) +
  geom_histogram(aes(fill = afc_category), bins = 100, color = "black", linewidth = 0.1) +
  scale_fill_manual(values = c(
    "Negative (ERROR)" = "red",
    "Too Young (<18 mo)" = "orange",
    "Normal (18-72 mo)" = "darkgreen",
    "Late (72-100 mo)" = "yellow",
    "Extreme (>100 mo)" = "purple"
  )) +
  geom_vline(xintercept = c(0, 18, 72), linetype = "dashed", color = "black", linewidth = 0.8) +
  annotate("text", x = 0, y = Inf, label = "0", vjust = 2, hjust = 0.5, size = 3) +
  annotate("text", x = 18, y = Inf, label = "18", vjust = 2, hjust = 0.5, size = 3) +
  annotate("text", x = 72, y = Inf, label = "72", vjust = 2, hjust = 0.5, size = 3) +
  labs(
    title = "Age at First Calving - Full Distribution",
    subtitle = paste0("Min = ", round(min(afc_data$afc_months), 1), " | Max = ", 
                      format(round(max(afc_data$afc_months), 1), big.mark = ","), 
                      " | Mean = ", round(mean(afc_data$afc_months), 1), " months"),
    x = "AFC (months)",
    y = "Count",
    fill = "Category"
  ) +
  theme(legend.position = "bottom") +
  scale_y_log10(labels = comma)

# Plot 2: Boxplot showing outliers
p2 <- ggplot(afc_data, aes(y = afc_months)) +
  geom_boxplot(fill = "steelblue", outlier.color = "red", outlier.size = 1) +
  labs(
    title = "AFC Boxplot - Outliers in Red",
    y = "AFC (months)"
  ) +
  coord_flip()

grid.arrange(p1, p2, ncol = 1, heights = c(2, 1))

6.2 Extreme AFC Values

# Show records with extreme AFC values
extreme_afc <- lactation %>%
  filter(parity == 1, !is.na(afc_months)) %>%
  filter(afc_months < 0 | afc_months > 100) %>%
  select(animal_id, farm_id, project_name, main_breed_name, 
         afc_months, afc_days, birthdate, calving_date) %>%
  arrange(afc_months) %>%
  head(50)

kable(extreme_afc, 
      caption = "Sample of Extreme AFC Values (Negative or > 100 months)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = TRUE) %>%
  scroll_box(height = "400px")
Sample of Extreme AFC Values (Negative or > 100 months)
animal_id farm_id project_name main_breed_name afc_months afc_days birthdate calving_date
176081 514151 NA Holstein Friesian -394 -11956 04/09/2018 7/15/1985
175712 514149 NA Holstein Friesian -358 -10868 04/09/2018 07/07/1988
175675 515076 NA Local Zebu -344 -10445 9/20/2017 2/14/1989
187786 510092 NA Sahiwal -309 -9378 7/16/2016 11/12/1990
178409 515077 NA Unknown -306 -9297 09/10/2019 3/28/1994
195491 598285 NA Ayrshire -296 -8998 02/01/2019 6/14/1994
178911 598285 NA Holstein Friesian -292 -8848 02/01/2020 11/11/1995
170328 595412 NA Ayrshire -291 -8822 05/08/2018 3/13/1994
178404 598285 NA Holstein Friesian -288 -8748 02/01/2020 2/19/1996
170285 515160 NA Ayrshire -273 -8282 7/27/2016 11/23/1993
171698 616608 NA Holstein Friesian -273 -8278 6/15/2016 10/16/1993
176871 597275 NA Local Zebu -272 -8256 10/21/2010 3/14/1988
170515 510092 NA Sahiwal -261 -7917 01/07/2018 05/05/1996
746443 601235 KLBA Ayrshire -258 -7831 05/04/2005 11/25/1983
335377 347826 STANLEY AND SON LIMITED - KENYA Boran -257 -7801 4/28/2011 12/18/1989
339751 347826 STANLEY AND SON LIMITED - KENYA Boran -254 -7698 1/15/2011 12/18/1989
170557 597358 NA Holstein Friesian -241 -7320 04/01/2020 3/17/2000
170506 514950 NA Local Zebu -240 -7272 05/04/2017 06/06/1997
170315 598316 NA Ayrshire -238 -7223 5/18/2018 08/08/1998
171861 514852 NA NA -237 -7202 03/07/2017 6/18/1997
171864 514855 NA NA -227 -6892 03/08/2017 4/25/1998
171859 514850 NA NA -226 -6858 03/07/2017 5/28/1998
174152 403781 AVCD Holstein Friesian -224 -6790 9/21/2016 2/18/1998
171863 514854 NA NA -223 -6771 03/08/2017 8/24/1998
171328 515320 NA Holstein Friesian -216 -6545 4/20/2019 5/19/2001
203420 514952 NA Local Zebu -212 -6432 05/04/2017 9/24/1999
171887 515079 NA Local Zebu -208 -6302 09/11/2019 06/10/2002
179369 598285 NA Holstein Friesian -206 -6235 02/01/2021 01/07/2004
792916 601289 KALRO Ayrshire -206 -6261 12/01/2022 10/10/2005
171888 403011 AVCD Unknown -205 -6223 5/18/2019 05/04/2002
170513 515024 NA Unknown -204 -6175 5/29/2019 07/02/2002
813818 601252 KLBA Holstein Friesian -204 -6193 4/18/2012 05/05/1995
175682 510092 NA Unknown -200 -6051 7/16/2016 12/22/1999
171299 514952 NA Local Zebu -196 -5944 05/04/2017 1/24/2001
743491 601217 KLBA Jersey -185 -5619 09/02/1997 4/15/1982
758995 601397 KLBA Holstein Friesian -185 -5621 11/15/2017 6/26/2002
754668 601289 KALRO Ayrshire -183 -5548 8/13/2019 06/04/2004
786263 601210 KLBA Holstein Friesian -179 -5417 04/04/2007 06/04/1992
801205 601289 KALRO Holstein Friesian -169 -5127 11/28/2022 11/14/2008
170559 616901 NA Ayrshire -166 -5027 10/17/2016 01/12/2003
177867 616942 NA Ayrshire -166 -5020 10/25/2016 1/27/2003
186024 598297 NA Holstein Friesian -156 -4738 05/07/2020 5/18/2007
813828 601252 KLBA Holstein Friesian -148 -4477 02/09/2012 11/07/1999
186411 515711 NA Ayrshire -147 -4468 4/20/2020 1/26/2008
286489 347826 STANLEY AND SON LIMITED - KENYA Holstein Friesian -140 -4252 5/22/1978 9/30/1966
748626 601246 KLBA Ayrshire -135 -4082 03/01/2006 12/27/1994
187716 598291 NA Ayrshire -126 -3817 10/07/2017 4/26/2007
187128 513219 NA Unknown -123 -3732 08/01/2019 5/13/2009
187718 598285 NA Holstein Friesian -114 -3437 02/01/2020 09/04/2010
195487 598285 NA Ayrshire -110 -3315 5/27/2018 4/29/2009

6.3 AFC by Breed

# AFC by breed - with proper filtering
afc_by_breed <- lactation %>%
  filter(parity == 1, !is.na(afc_months), !is.na(main_breed_name),
         afc_months >= 18, afc_months <= 72) %>%  # Clean data only
  group_by(main_breed_name) %>%
  summarise(
    N = n(),
    Min = round(min(afc_months, na.rm = TRUE), 1),
    Max = round(max(afc_months, na.rm = TRUE), 1),
    Mean = round(mean(afc_months, na.rm = TRUE), 2),
    SD = round(sd(afc_months, na.rm = TRUE), 2),
    Median = round(median(afc_months, na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  filter(N >= 30) %>%
  arrange(Mean)

kable(afc_by_breed, 
      caption = "AFC by Breed (Cleaned: 18-72 months, N≥30)",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
AFC by Breed (Cleaned: 18-72 months, N≥30)
main_breed_name N Min Max Mean SD Median
Scandinavian Red 50 18 38 26.84 4.51 25.5
Jersey crosses 70 18 52 30.11 6.17 29.5
Gir 199 21 72 31.07 7.48 30.0
Fleckvieh 55 21 62 31.49 8.06 30.0
Jersey 1,144 18 72 31.50 10.19 28.0
Holstein-Friesian crosses 1,255 18 72 35.78 7.02 35.0
Ayrshire crosses 88 18 72 36.75 10.89 34.0
Holstein Friesian 15,944 18 72 37.21 11.06 34.0
Sahiwal crosses 237 22 60 38.36 8.24 37.0
Boran 101 19 69 38.67 9.09 37.0
Unknown 43 24 71 40.09 14.00 36.0
Guernsey 574 19 72 41.44 12.62 38.0
Ayrshire 5,188 18 72 43.49 11.60 41.0
Brown Swiss 147 19 72 44.18 13.94 42.0
Sahiwal 2,875 18 72 49.27 11.44 49.0
# Plot
ggplot(afc_by_breed, aes(x = reorder(main_breed_name, Mean), y = Mean)) +
  geom_bar(stat = "identity", fill = "steelblue", width = 0.7) +
  geom_errorbar(aes(ymin = Mean - SD, ymax = Mean + SD), width = 0.2, color = "black") +
  geom_text(aes(label = paste0(round(Mean, 1), "\n(n=", N, ")")), 
            hjust = -0.1, size = 3) +
  coord_flip() +
  labs(
    title = "Mean Age at First Calving by Breed",
    subtitle = "Error bars = ±1 SD | Filtered: 18-72 months",
    x = "Breed",
    y = "Mean AFC (months)"
  ) +
  scale_y_continuous(limits = c(0, max(afc_by_breed$Mean + afc_by_breed$SD, na.rm = TRUE) * 1.3))


7 Daily Milk Yield Issues

7.1 Distribution with Outliers

# Daily yield categories
dmy_data <- milking %>%
  filter(!is.na(daily_yield)) %>%
  mutate(
    yield_category = case_when(
      daily_yield < 0 ~ "Negative (ERROR)",
      daily_yield == 0 ~ "Zero Yield",
      daily_yield > 0 & daily_yield < 0.5 ~ "Very Low (<0.5 kg)",
      daily_yield >= 0.5 & daily_yield <= 30 ~ "Normal (0.5-30 kg)",
      daily_yield > 30 & daily_yield <= 50 ~ "High (30-50 kg)",
      daily_yield > 50 ~ "Extreme (>50 kg)"
    )
  )

# Summary with Min, Max, Mean
dmy_summary <- dmy_data %>%
  group_by(yield_category) %>%
  summarise(
    N = n(),
    Min = round(min(daily_yield, na.rm = TRUE), 2),
    Max = round(max(daily_yield, na.rm = TRUE), 2),
    Mean = round(mean(daily_yield, na.rm = TRUE), 2),
    SD = round(sd(daily_yield, na.rm = TRUE), 2),
    Median = round(median(daily_yield, na.rm = TRUE), 2),
    .groups = "drop"
  ) %>%
  mutate(Percentage = round(100 * N / sum(N), 2))

kable(dmy_summary, 
      caption = "Daily Milk Yield Categories with Min/Max/Mean",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Daily Milk Yield Categories with Min/Max/Mean
yield_category N Min Max Mean SD Median Percentage
High (30-50 kg) 1,936 30.10 50.00 32.68 2.83 31.9 0.18
Normal (0.5-30 kg) 673,257 0.50 30.00 8.47 6.96 8.1 64.21
Very Low (<0.5 kg) 40,284 0.01 0.49 0.28 0.12 0.3 3.84
Zero Yield 333,069 0.00 0.00 0.00 0.00 0.0 31.76
# Plot 1: Histogram
p1 <- ggplot(dmy_data, aes(x = daily_yield)) +
  geom_histogram(aes(fill = yield_category), bins = 100, color = "black", linewidth = 0.1) +
  scale_fill_manual(values = c(
    "Negative (ERROR)" = "red",
    "Zero Yield" = "gray40",
    "Very Low (<0.5 kg)" = "orange",
    "Normal (0.5-30 kg)" = "darkgreen",
    "High (30-50 kg)" = "blue",
    "Extreme (>50 kg)" = "purple"
  )) +
  labs(
    title = "Daily Milk Yield - Full Distribution",
    subtitle = paste0("Min = ", round(min(dmy_data$daily_yield), 2), " | Max = ", 
                      round(max(dmy_data$daily_yield), 2), " | Mean = ", 
                      round(mean(dmy_data$daily_yield), 2), " kg"),
    x = "Daily Yield (kg)",
    y = "Count (log scale)",
    fill = "Category"
  ) +
  scale_y_log10(labels = comma) +
  theme(legend.position = "bottom")

# Plot 2: Focus on normal range with cleaned mean
clean_dmy <- dmy_data %>% filter(daily_yield >= 0.5, daily_yield <= 50)
p2 <- ggplot(clean_dmy, aes(x = daily_yield)) +
  geom_histogram(bins = 50, fill = "steelblue", color = "black", linewidth = 0.1) +
  geom_vline(xintercept = mean(clean_dmy$daily_yield), 
             linetype = "dashed", color = "red", linewidth = 1) +
  annotate("text", x = mean(clean_dmy$daily_yield), y = Inf, 
           label = paste0("Mean = ", round(mean(clean_dmy$daily_yield), 2), " kg"),
           vjust = 2, hjust = -0.1, color = "red", size = 4) +
  labs(
    title = "Daily Milk Yield - Cleaned (0.5-50 kg)",
    subtitle = paste0("N = ", format(nrow(clean_dmy), big.mark = ","), 
                      " | Mean = ", round(mean(clean_dmy$daily_yield), 2),
                      " | SD = ", round(sd(clean_dmy$daily_yield), 2), " kg"),
    x = "Daily Yield (kg)",
    y = "Count"
  ) +
  scale_y_continuous(labels = comma)

# Plot 3: Boxplot
p3 <- ggplot(dmy_data, aes(y = daily_yield)) +
  geom_boxplot(fill = "lightblue", outlier.color = "red", outlier.size = 0.5) +
  labs(title = "Daily Yield Boxplot", y = "Daily Yield (kg)") +
  coord_flip()

grid.arrange(p1, p2, p3, ncol = 1, heights = c(2, 2, 1))

7.2 Daily Yield by Breed

# Daily yield by breed - cleaned
dmy_by_breed <- milking %>%
  filter(!is.na(daily_yield), !is.na(main_breed_name),
         daily_yield >= 0.5, daily_yield <= 50) %>%
  group_by(main_breed_name) %>%
  summarise(
    N_Records = n(),
    N_Animals = n_distinct(animal_id),
    Min = round(min(daily_yield, na.rm = TRUE), 2),
    Max = round(max(daily_yield, na.rm = TRUE), 2),
    Mean = round(mean(daily_yield, na.rm = TRUE), 2),
    SD = round(sd(daily_yield, na.rm = TRUE), 2),
    Median = round(median(daily_yield, na.rm = TRUE), 2),
    .groups = "drop"
  ) %>%
  filter(N_Records >= 500) %>%
  arrange(desc(Mean))

kable(dmy_by_breed, 
      caption = "Daily Milk Yield by Breed (Cleaned: 0.5-50 kg, N≥500)",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Daily Milk Yield by Breed (Cleaned: 0.5-50 kg, N≥500)
main_breed_name N_Records N_Animals Min Max Mean SD Median
Jersey crosses 25,699 55 0.5 31.4 14.61 4.97 14.20
Gir 82,442 184 0.5 39.5 14.14 5.28 13.60
Ayrshire crosses 16,660 41 0.5 34.0 13.90 4.83 13.70
Sahiwal crosses 24,438 41 1.0 37.1 13.22 5.23 12.65
Holstein-Friesian crosses 69,743 394 0.8 37.5 13.19 5.40 12.20
Scandinavian Red 23,510 49 1.0 34.0 12.84 4.50 12.30
Brown Swiss 21,789 126 0.5 36.0 12.54 6.59 12.30
Girolando 12,472 22 1.4 31.1 12.30 4.53 12.30
Fleckvieh 16,331 46 0.5 28.9 12.05 4.54 11.60
Boran 3,280 39 0.5 25.2 10.44 4.39 9.60
Holstein Friesian 250,367 9,107 0.5 50.0 5.56 6.38 2.00
Jersey 14,583 537 0.5 30.0 5.12 6.01 1.42
Sahiwal 26,437 1,422 0.5 30.0 3.26 3.69 2.00
Ayrshire 74,726 3,557 0.5 36.3 2.83 3.94 1.37
Guernsey 8,131 419 0.5 27.6 1.74 2.45 0.95
# Plot
ggplot(dmy_by_breed, aes(x = reorder(main_breed_name, Mean), y = Mean)) +
  geom_bar(stat = "identity", fill = "darkgreen", width = 0.7) +
  geom_errorbar(aes(ymin = Mean - SD/sqrt(N_Records)*1.96, 
                    ymax = Mean + SD/sqrt(N_Records)*1.96), 
                width = 0.2, color = "black") +
  geom_text(aes(label = paste0(round(Mean, 1), " kg")), 
            hjust = -0.1, size = 3) +
  coord_flip() +
  labs(
    title = "Mean Daily Milk Yield by Breed",
    subtitle = "Error bars = 95% CI | Filtered: 0.5-50 kg",
    x = "Breed",
    y = "Mean Daily Yield (kg)"
  ) +
  scale_y_continuous(limits = c(0, max(dmy_by_breed$Mean, na.rm = TRUE) * 1.3))


8 Days in Milk (DIM) Issues

8.1 Distribution

# DIM categories
dim_data <- milking %>%
  filter(!is.na(dim)) %>%
  mutate(
    dim_category = case_when(
      dim < 0 ~ "Negative (ERROR)",
      dim == 0 ~ "Zero DIM",
      dim >= 1 & dim <= 5 ~ "Very Early (1-5 d)",
      dim > 5 & dim <= 305 ~ "Normal (5-305 d)",
      dim > 305 & dim <= 500 ~ "Extended (305-500 d)",
      dim > 500 & dim <= 1000 ~ "Very Extended (500-1000 d)",
      dim > 1000 ~ "Extreme (>1000 d)"
    )
  )

# Summary
dim_summary <- dim_data %>%
  group_by(dim_category) %>%
  summarise(
    N = n(),
    Min = min(dim, na.rm = TRUE),
    Max = max(dim, na.rm = TRUE),
    Mean = round(mean(dim, na.rm = TRUE), 1),
    SD = round(sd(dim, na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  mutate(Percentage = round(100 * N / sum(N), 2))

kable(dim_summary, 
      caption = "Days in Milk Categories with Min/Max/Mean",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Days in Milk Categories with Min/Max/Mean
dim_category N Min Max Mean SD Percentage
Extended (305-500 d) 117,863 306 500 376.9 55.0 11.24
Extreme (>1000 d) 22,267 1,001 7,752 1,596.5 718.3 2.12
Negative (ERROR) 10 -729,939 -298,437 -633,728.4 119,639.4 0.00
Normal (5-305 d) 828,097 6 305 142.5 83.9 78.97
Very Early (1-5 d) 19,458 1 5 4.3 1.1 1.86
Very Extended (500-1000 d) 57,618 501 1,000 682.7 138.3 5.49
Zero DIM 3,262 0 0 0.0 0.0 0.31
# Plot
p1 <- ggplot(dim_data, aes(x = dim)) +
  geom_histogram(aes(fill = dim_category), bins = 100, color = "black", linewidth = 0.1) +
  scale_fill_manual(values = c(
    "Negative (ERROR)" = "red",
    "Zero DIM" = "gray40",
    "Very Early (1-5 d)" = "orange",
    "Normal (5-305 d)" = "darkgreen",
    "Extended (305-500 d)" = "yellow",
    "Very Extended (500-1000 d)" = "purple",
    "Extreme (>1000 d)" = "black"
  )) +
  labs(
    title = "Days in Milk - Full Distribution",
    subtitle = paste0("Min = ", format(min(dim_data$dim), big.mark = ","), 
                      " | Max = ", format(max(dim_data$dim), big.mark = ","),
                      " (Note extreme negative!)"),
    x = "DIM (days)",
    y = "Count (log scale)",
    fill = "Category"
  ) +
  scale_y_log10(labels = comma) +
  theme(legend.position = "bottom")

print(p1)


9 Total Lactation Yield Issues

9.1 Distribution

# TLY data
tly_data <- lactation %>%
  filter(!is.na(total_lactation_yield)) %>%
  mutate(
    tly_category = case_when(
      total_lactation_yield < 0 ~ "Negative (ERROR)",
      total_lactation_yield == 0 ~ "Zero Yield",
      total_lactation_yield > 0 & total_lactation_yield < 100 ~ "Very Low (<100 kg)",
      total_lactation_yield >= 100 & total_lactation_yield <= 5000 ~ "Normal (100-5000 kg)",
      total_lactation_yield > 5000 & total_lactation_yield <= 10000 ~ "High (5000-10000 kg)",
      total_lactation_yield > 10000 & total_lactation_yield <= 15000 ~ "Very High (10-15k kg)",
      total_lactation_yield > 15000 ~ "Extreme (>15000 kg)"
    )
  )

# Summary
tly_summary <- tly_data %>%
  group_by(tly_category) %>%
  summarise(
    N = n(),
    Min = round(min(total_lactation_yield, na.rm = TRUE), 1),
    Max = round(max(total_lactation_yield, na.rm = TRUE), 1),
    Mean = round(mean(total_lactation_yield, na.rm = TRUE), 1),
    SD = round(sd(total_lactation_yield, na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  mutate(Percentage = round(100 * N / sum(N), 2))

kable(tly_summary, 
      caption = "Total Lactation Yield Categories with Min/Max/Mean",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Total Lactation Yield Categories with Min/Max/Mean
tly_category N Min Max Mean SD Percentage
Extreme (>15000 kg) 9 15,205.2 23,186.4 18,090.9 2,309.1 0.01
High (5000-10000 kg) 361 5,004.2 9,979.2 6,615.5 1,290.3 0.41
Normal (100-5000 kg) 9,199 100.0 4,999.0 694.1 953.7 10.41
Very High (10-15k kg) 39 10,000.7 14,973.5 11,598.2 1,589.4 0.04
Very Low (<100 kg) 27,585 0.0 99.9 18.5 19.0 31.22
Zero Yield 51,154 0.0 0.0 0.0 0.0 57.90
# Major issue: Many zero yields!
zero_tly_pct <- round(100 * sum(tly_data$total_lactation_yield == 0) / nrow(tly_data), 1)

# Plot
p1 <- ggplot(tly_data, aes(x = total_lactation_yield)) +
  geom_histogram(aes(fill = tly_category), bins = 100, color = "black", linewidth = 0.1) +
  scale_fill_manual(values = c(
    "Negative (ERROR)" = "red",
    "Zero Yield" = "gray40",
    "Very Low (<100 kg)" = "orange",
    "Normal (100-5000 kg)" = "darkgreen",
    "High (5000-10000 kg)" = "blue",
    "Very High (10-15k kg)" = "purple",
    "Extreme (>15000 kg)" = "black"
  )) +
  labs(
    title = "Total Lactation Yield - Full Distribution",
    subtitle = paste0("WARNING: ", zero_tly_pct, "% of records have ZERO yield!"),
    x = "Total Lactation Yield (kg)",
    y = "Count (log scale)",
    fill = "Category"
  ) +
  scale_y_log10(labels = comma) +
  theme(legend.position = "bottom")

# Non-zero distribution with mean
tly_nonzero <- tly_data %>% filter(total_lactation_yield > 0, total_lactation_yield <= 15000)
p2 <- ggplot(tly_nonzero, aes(x = total_lactation_yield)) +
  geom_histogram(bins = 50, fill = "steelblue", color = "black") +
  geom_vline(xintercept = mean(tly_nonzero$total_lactation_yield), 
             linetype = "dashed", color = "red", linewidth = 1) +
  annotate("text", x = mean(tly_nonzero$total_lactation_yield), y = Inf,
           label = paste0("Mean = ", round(mean(tly_nonzero$total_lactation_yield), 1), " kg"),
           vjust = 2, hjust = -0.1, color = "red") +
  labs(
    title = "Total Lactation Yield - Non-Zero Values Only",
    subtitle = paste0("N = ", format(nrow(tly_nonzero), big.mark = ","),
                      " | Mean = ", round(mean(tly_nonzero$total_lactation_yield), 1),
                      " | SD = ", round(sd(tly_nonzero$total_lactation_yield), 1), " kg"),
    x = "Total Lactation Yield (kg)",
    y = "Count"
  )

grid.arrange(p1, p2, ncol = 1)


10 Calving Interval Issues

10.1 Distribution

# CI data (parity > 1)
ci_data <- lactation %>%
  filter(parity > 1, !is.na(calving_interval_days)) %>%
  mutate(
    ci_category = case_when(
      calving_interval_days < 0 ~ "Negative (ERROR)",
      calving_interval_days >= 0 & calving_interval_days < 300 ~ "Too Short (<300 d)",
      calving_interval_days >= 300 & calving_interval_days <= 450 ~ "Optimal (300-450 d)",
      calving_interval_days > 450 & calving_interval_days <= 600 ~ "Long (450-600 d)",
      calving_interval_days > 600 & calving_interval_days <= 730 ~ "Very Long (600-730 d)",
      calving_interval_days > 730 ~ "Extreme (>730 d / 2 yrs)"
    )
  )

# Summary
ci_summary <- ci_data %>%
  group_by(ci_category) %>%
  summarise(
    N = n(),
    Min = round(min(calving_interval_days, na.rm = TRUE), 1),
    Max = round(max(calving_interval_days, na.rm = TRUE), 1),
    Mean = round(mean(calving_interval_days, na.rm = TRUE), 1),
    SD = round(sd(calving_interval_days, na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  mutate(Percentage = round(100 * N / sum(N), 2))

kable(ci_summary, 
      caption = "Calving Interval Categories with Min/Max/Mean",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Calving Interval Categories with Min/Max/Mean
ci_category N Min Max Mean SD Percentage
Extreme (>730 d / 2 yrs) 6,784 731 16,530 1,135.2 595.6 13.44
Long (450-600 d) 11,011 451 600 512.1 41.8 21.82
Optimal (300-450 d) 27,179 300 450 377.6 36.5 53.86
Too Short (<300 d) 1,761 0 299 151.7 108.3 3.49
Very Long (600-730 d) 3,728 601 730 659.3 37.6 7.39
# Plots
p1 <- ggplot(ci_data, aes(x = calving_interval_days)) +
  geom_histogram(aes(fill = ci_category), bins = 100, color = "black", linewidth = 0.1) +
  scale_fill_manual(values = c(
    "Negative (ERROR)" = "red",
    "Too Short (<300 d)" = "orange",
    "Optimal (300-450 d)" = "darkgreen",
    "Long (450-600 d)" = "yellow",
    "Very Long (600-730 d)" = "purple",
    "Extreme (>730 d / 2 yrs)" = "black"
  )) +
  labs(
    title = "Calving Interval - Full Distribution",
    subtitle = paste0("Min = ", round(min(ci_data$calving_interval_days), 1), 
                      " | Max = ", format(round(max(ci_data$calving_interval_days), 1), big.mark = ","),
                      " | Mean = ", round(mean(ci_data$calving_interval_days), 1), " days"),
    x = "Calving Interval (days)",
    y = "Count (log scale)",
    fill = "Category"
  ) +
  scale_y_log10(labels = comma) +
  theme(legend.position = "bottom")

print(p1)


11 Days Open Issues

11.1 Distribution

# Days open data
do_data <- lactation %>%
  filter(!is.na(days_open)) %>%
  mutate(
    do_category = case_when(
      days_open < 0 ~ "Negative (ERROR)",
      days_open >= 0 & days_open < 21 ~ "Too Short (<21 d)",
      days_open >= 21 & days_open <= 120 ~ "Optimal (21-120 d)",
      days_open > 120 & days_open <= 200 ~ "Acceptable (120-200 d)",
      days_open > 200 & days_open <= 365 ~ "Long (200-365 d)",
      days_open > 365 ~ "Extreme (>365 d)"
    )
  )

# Summary
do_summary <- do_data %>%
  group_by(do_category) %>%
  summarise(
    N = n(),
    Min = round(min(days_open, na.rm = TRUE), 1),
    Max = round(max(days_open, na.rm = TRUE), 1),
    Mean = round(mean(days_open, na.rm = TRUE), 1),
    SD = round(sd(days_open, na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  mutate(Percentage = round(100 * N / sum(N), 2))

kable(do_summary, 
      caption = "Days Open Categories with Min/Max/Mean",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Days Open Categories with Min/Max/Mean
do_category N Min Max Mean SD Percentage
Acceptable (120-200 d) 11,082 121 200 155.7 22.8 21.85
Extreme (>365 d) 9,053 366 669,062 821.1 7,046.7 17.85
Long (200-365 d) 9,364 201 365 266.3 45.8 18.47
Negative (ERROR) 1,531 -280 -1 -159.2 97.2 3.02
Optimal (21-120 d) 19,353 21 120 79.0 24.0 38.16
Too Short (<21 d) 327 0 20 11.6 6.0 0.64
# Plot
p1 <- ggplot(do_data, aes(x = days_open)) +
  geom_histogram(aes(fill = do_category), bins = 100, color = "black", linewidth = 0.1) +
  scale_fill_manual(values = c(
    "Negative (ERROR)" = "red",
    "Too Short (<21 d)" = "orange",
    "Optimal (21-120 d)" = "darkgreen",
    "Acceptable (120-200 d)" = "lightgreen",
    "Long (200-365 d)" = "yellow",
    "Extreme (>365 d)" = "purple"
  )) +
  labs(
    title = "Days Open - Full Distribution",
    subtitle = paste0("Min = ", round(min(do_data$days_open), 1), " (NEGATIVE!) | Max = ", 
                      format(round(max(do_data$days_open), 1), big.mark = ",")),
    x = "Days Open",
    y = "Count (log scale)",
    fill = "Category"
  ) +
  scale_y_log10(labels = comma) +
  scale_x_continuous(limits = c(-500, 2000)) +
  theme(legend.position = "bottom")

print(p1)


12 Combined Outlier Scatter Plots

12.1 AFC vs Daily Yield

# Merge data for scatter plots
scatter_data <- milking %>%
  filter(!is.na(afc_months), !is.na(daily_yield)) %>%
  filter(afc_months > -500, afc_months < 500, daily_yield >= 0, daily_yield <= 60)

# Sample if too large
if(nrow(scatter_data) > 50000) {
  scatter_data <- scatter_data %>% sample_n(50000)
}

ggplot(scatter_data, aes(x = afc_months, y = daily_yield)) +
  geom_point(alpha = 0.3, size = 0.5, color = "steelblue") +
  geom_vline(xintercept = c(0, 18, 72), linetype = "dashed", color = "red") +
  geom_hline(yintercept = c(0, 50), linetype = "dashed", color = "red") +
  annotate("rect", xmin = 18, xmax = 72, ymin = 0.5, ymax = 50, 
           alpha = 0.1, fill = "green") +
  labs(
    title = "AFC vs Daily Milk Yield - Outlier Detection",
    subtitle = "Green zone = expected normal range; Red lines = thresholds",
    x = "Age at First Calving (months)",
    y = "Daily Milk Yield (kg)"
  )

12.2 DIM vs Daily Yield

scatter_dim <- milking %>%
  filter(!is.na(dim), !is.na(daily_yield)) %>%
  filter(dim > -1000, dim < 1000, daily_yield >= 0, daily_yield <= 60)

# Sample if too large
if(nrow(scatter_dim) > 50000) {
  scatter_dim <- scatter_dim %>% sample_n(50000)
}

ggplot(scatter_dim, aes(x = dim, y = daily_yield)) +
  geom_point(alpha = 0.3, size = 0.5, color = "darkgreen") +
  geom_vline(xintercept = c(0, 305, 500), linetype = "dashed", color = "red") +
  geom_hline(yintercept = c(0, 50), linetype = "dashed", color = "red") +
  annotate("rect", xmin = 5, xmax = 305, ymin = 0.5, ymax = 40, 
           alpha = 0.1, fill = "green") +
  labs(
    title = "Days in Milk vs Daily Yield - Outlier Detection",
    subtitle = "Green zone = expected normal range",
    x = "Days in Milk (DIM)",
    y = "Daily Milk Yield (kg)"
  )


13 Summary and Recommendations

13.1 Data Quality Summary

# Create final summary table with proper NA handling
quality_summary <- data.frame(
  Trait = c("Age at First Calving", "Calving Interval", "Total Lactation Yield",
            "Daily Milk Yield", "Days in Milk", "Days Open", "Parity"),
  
  N_Total = c(
    sum(!is.na(lactation$afc_months) & lactation$parity == 1),
    sum(!is.na(lactation$calving_interval_days) & lactation$parity > 1),
    sum(!is.na(lactation$total_lactation_yield)),
    sum(!is.na(milking$daily_yield)),
    sum(!is.na(milking$dim)),
    sum(!is.na(lactation$days_open)),
    sum(!is.na(lactation$parity))
  ),
  
  Min_Value = c(
    round(min(lactation$afc_months[lactation$parity == 1], na.rm = TRUE), 1),
    round(min(lactation$calving_interval_days[lactation$parity > 1], na.rm = TRUE), 1),
    round(min(lactation$total_lactation_yield, na.rm = TRUE), 1),
    round(min(milking$daily_yield, na.rm = TRUE), 2),
    round(min(milking$dim, na.rm = TRUE), 0),
    round(min(lactation$days_open, na.rm = TRUE), 1),
    round(min(lactation$parity, na.rm = TRUE), 0)
  ),
  
  Max_Value = c(
    round(max(lactation$afc_months[lactation$parity == 1], na.rm = TRUE), 1),
    round(max(lactation$calving_interval_days[lactation$parity > 1], na.rm = TRUE), 1),
    round(max(lactation$total_lactation_yield, na.rm = TRUE), 1),
    round(max(milking$daily_yield, na.rm = TRUE), 2),
    round(max(milking$dim, na.rm = TRUE), 0),
    round(max(lactation$days_open, na.rm = TRUE), 1),
    round(max(lactation$parity, na.rm = TRUE), 0)
  ),
  
  Raw_Mean = c(
    round(mean(lactation$afc_months[lactation$parity == 1], na.rm = TRUE), 2),
    round(mean(lactation$calving_interval_days[lactation$parity > 1], na.rm = TRUE), 2),
    round(mean(lactation$total_lactation_yield, na.rm = TRUE), 2),
    round(mean(milking$daily_yield, na.rm = TRUE), 2),
    round(mean(milking$dim, na.rm = TRUE), 2),
    round(mean(lactation$days_open, na.rm = TRUE), 2),
    round(mean(lactation$parity, na.rm = TRUE), 2)
  ),
  
  Recommended_Range = c(
    "18-72 months",
    "300-700 days",
    "100-15,000 kg",
    "0.5-50 kg",
    "5-500 days",
    "21-400 days",
    "1-12"
  ),
  
  Problem_Count = c(
    sum((lactation$afc_months < 0 | lactation$afc_months > 100) & lactation$parity == 1, na.rm = TRUE),
    sum((lactation$calving_interval_days < 280 | lactation$calving_interval_days > 1000) & lactation$parity > 1, na.rm = TRUE),
    sum(lactation$total_lactation_yield == 0 | lactation$total_lactation_yield > 20000, na.rm = TRUE),
    sum(milking$daily_yield <= 0 | milking$daily_yield > 50, na.rm = TRUE),
    sum(milking$dim < 0 | milking$dim > 1000, na.rm = TRUE),
    sum(lactation$days_open < 0 | lactation$days_open > 500, na.rm = TRUE),
    sum(lactation$parity < 1 | lactation$parity > 15, na.rm = TRUE)
  )
)

# Calculate percentages
quality_summary <- quality_summary %>%
  mutate(
    Problem_Pct = round(100 * Problem_Count / N_Total, 2),
    Quality_Status = case_when(
      Problem_Pct < 1 ~ "Good",
      Problem_Pct < 5 ~ "Minor Issues",
      Problem_Pct < 20 ~ "Moderate Issues",
      TRUE ~ "Major Issues"
    )
  )

kable(quality_summary, 
      caption = "Data Quality Summary with Min/Max Values",
      col.names = c("Trait", "N Total", "Min", "Max", "Raw Mean", 
                    "Recommended Range", "Problem N", "Problem %", "Status"),
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)
Data Quality Summary with Min/Max Values
Trait N Total Min Max Raw Mean Recommended Range Problem N Problem % Status
Age at First Calving NA -394 24,295.0 50.96 18-72 months 1,608 NA Major Issues
Calving Interval NA 0 16,530.0 521.70 300-700 days 4,529 NA Major Issues
Total Lactation Yield 88,347 0 23,186.4 112.02 100-15,000 kg 51,155 57.90 Major Issues
Daily Milk Yield 1,048,546 0 50.0 5.51 0.5-50 kg 333,069 31.76 Major Issues
Days in Milk 1,048,575 -729,939 7,752.0 220.35 5-500 days 22,277 2.12 Minor Issues
Days Open 50,710 -280 669,062.0 255.21 21-400 days 7,307 14.41 Moderate Issues
Parity 87,990 1 32.0 2.42 1-12 39 0.04 Good

13.2 Key Findings

Summary of Outliers Detected:

Trait Min Value Max Value Issue
AFC -394 months 24,295 months Negative values & extreme highs
DIM -729,939 days 7,752 days Extreme negative values
Days Open -280 days 669,062 days Negative values
Total Lact. Yield 0 kg 23,186.4 kg 57.9% zeros

13.3 Suggested Data Cleaning Filters

# Lactation data cleaning
lactation_clean <- lactation %>%
  filter(
    is.na(afc_months) | (afc_months >= 18 & afc_months <= 72),
    is.na(calving_interval_days) | (calving_interval_days >= 300 & calving_interval_days <= 700),
    total_lactation_yield > 0 & total_lactation_yield <= 15000,
    is.na(days_open) | (days_open >= 21 & days_open <= 400),
    parity >= 1 & parity <= 12
  )

# Milking data cleaning
milking_clean <- milking %>%
  filter(
    dim >= 5 & dim <= 500,
    daily_yield >= 0.5 & daily_yield <= 50,
    is.na(afc_months) | (afc_months >= 18 & afc_months <= 72),
    is.na(parity) | (parity >= 1 & parity <= 12)
  )

14 Session Info

sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_Kenya.utf8  LC_CTYPE=English_Kenya.utf8   
## [3] LC_MONETARY=English_Kenya.utf8 LC_NUMERIC=C                  
## [5] LC_TIME=English_Kenya.utf8    
## 
## time zone: Africa/Nairobi
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] viridis_0.6.5     viridisLite_0.4.2 kableExtra_1.4.0  knitr_1.50       
##  [5] gridExtra_2.3     scales_1.4.0      ggplot2_3.5.2     tidyr_1.3.1      
##  [9] dplyr_1.1.4       data.table_1.17.8
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_2.0.0     compiler_4.5.1     tidyselect_1.2.1  
##  [5] xml2_1.4.0         stringr_1.5.1      jquerylib_0.1.4    textshaping_1.0.2 
##  [9] systemfonts_1.2.3  yaml_2.3.10        fastmap_1.2.0      R6_2.6.1          
## [13] labeling_0.4.3     generics_0.1.4     tibble_3.3.0       svglite_2.2.1     
## [17] bslib_0.9.0        pillar_1.11.0      RColorBrewer_1.1-3 rlang_1.1.6       
## [21] stringi_1.8.7      cachem_1.1.0       xfun_0.52          sass_0.4.10       
## [25] cli_3.6.5          withr_3.0.2        magrittr_2.0.3     digest_0.6.37     
## [29] grid_4.5.1         rstudioapi_0.17.1  lifecycle_1.0.4    vctrs_0.6.5       
## [33] evaluate_1.0.5     glue_1.8.0         farver_2.1.2       rmarkdown_2.29    
## [37] purrr_1.1.0        tools_4.5.1        pkgconfig_2.0.3    htmltools_0.5.8.1

Report generated on 05 December 2025 at 04:48