Executive Summary

This report provides a comprehensive data quality assessment for the Kenya ADGG/TDGG dairy datasets. It identifies:

Outliers in key traits (AFC, milk yield, calving interval, etc.)
Impossible/implausible values (negative ages, extreme yields)
Data entry errors (dates in wrong fields, miscoded values)
Missing data patterns
Recommendations for data cleaning

1 Data Loading

# SET YOUR FILE PATH HERE
data_path <- "C:/Users/SAlemu/OneDrive - CGIAR/Documents/KenyaData/New_Kenya_data/"

# Read data - treating "NULL" as NA
lactation <- fread(paste0(data_path, "lactation_calving.csv"), 
                   na.strings = c("NULL", "null", "NA", "N/A", "", " ", "NaN"),
                   stringsAsFactors = FALSE)

milking <- fread(paste0(data_path, "milking_data.csv"), 
                 na.strings = c("NULL", "null", "NA", "N/A", "", " ", "NaN"),
                 stringsAsFactors = FALSE)

# Convert numeric columns
lactation <- lactation %>%
  mutate(
    parity = as.numeric(parity),
    afc_months = as.numeric(afc_months),
    afc_days = as.numeric(afc_days),
    calving_interval_days = as.numeric(calving_interval_days),
    calving_interval_months = as.numeric(calving_interval_months),
    lactation_length_days = as.numeric(lactation_length_days),
    total_lactation_yield = as.numeric(total_lactation_yield),
    testday_count = as.numeric(testday_count),
    days_open = as.numeric(days_open)
  )

milking <- milking %>%
  mutate(
    parity = as.numeric(parity),
    afc_months = as.numeric(afc_months),
    afc_days = as.numeric(afc_days),
    dim = as.numeric(dim),
    daily_yield = as.numeric(daily_yield),
    milk_morning = as.numeric(milk_morning),
    milk_evening = as.numeric(milk_evening),
    milk_midday = as.numeric(milk_midday),
    testday_no = as.numeric(testday_no),
    calving_age_months = as.numeric(calving_age_months)
  )

cat("Lactation data:", format(nrow(lactation), big.mark = ","), "records\n")

## Lactation data: 88,352 records

cat("Milking data:", format(nrow(milking), big.mark = ","), "records\n")

## Milking data: 1,048,575 records

2 Overall Descriptive Statistics (Raw Data)

This section shows the raw statistics including outliers to highlight data quality issues.

# Function to calculate comprehensive stats with NA handling
calc_stats <- function(x, var_name) {
  x_valid <- x[!is.na(x)]
  n_total <- length(x)
  n_valid <- length(x_valid)
  n_missing <- sum(is.na(x))
  
  if (n_valid == 0) {
    return(data.frame(
      Variable = var_name,
      N_Total = n_total,
      N_Valid = 0,
      N_Missing = n_missing,
      Pct_Missing = 100,
      Min = NA, Max = NA, Mean = NA, SD = NA, Median = NA
    ))
  }
  
  data.frame(
    Variable = var_name,
    N_Total = n_total,
    N_Valid = n_valid,
    N_Missing = n_missing,
    Pct_Missing = round(100 * n_missing / n_total, 2),
    Min = round(min(x_valid), 2),
    Max = round(max(x_valid), 2),
    Mean = round(mean(x_valid), 2),
    SD = round(sd(x_valid), 2),
    Median = round(median(x_valid), 2)
  )
}

# Lactation traits - RAW (including outliers)
lact_stats_raw <- rbind(
  calc_stats(lactation$afc_months[lactation$parity == 1], "AFC (months) - Parity 1"),
  calc_stats(lactation$afc_days[lactation$parity == 1], "AFC (days) - Parity 1"),
  calc_stats(lactation$calving_interval_days[lactation$parity > 1], "Calving Interval (days)"),
  calc_stats(lactation$lactation_length_days, "Lactation Length (days)"),
  calc_stats(lactation$total_lactation_yield, "Total Lactation Yield (kg)"),
  calc_stats(lactation$days_open, "Days Open"),
  calc_stats(lactation$parity, "Parity"),
  calc_stats(lactation$testday_count, "Test-day Count")
)

kable(lact_stats_raw, 
      caption = "Lactation Data - RAW Statistics (Including Outliers)",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  column_spec(6, color = "red", bold = TRUE) %>%
  column_spec(7, color = "red", bold = TRUE)

Lactation Data - RAW Statistics (Including Outliers)
Variable	N_Total	N_Valid	N_Missing	Pct_Missing	Min	Max	Mean	SD	Median
AFC (months) - Parity 1	37,598	33,015	4,583	12.19	-394	24,295.0	50.96	364.55	38
AFC (days) - Parity 1	37,598	33,015	4,583	12.19	-11,956	738,588.0	1,563.98	11,082.36	1,158
Calving Interval (days)	51,116	50,463	653	1.28	0	16,530.0	521.70	342.46	424
Lactation Length (days)	88,352	50,707	37,645	42.61	0	669,342.0	535.21	2,990.38	424
Total Lactation Yield (kg)	88,352	88,347	5	0.01	0	23,186.4	112.02	642.21	0
Days Open	88,352	50,710	37,642	42.60	-280	669,062.0	255.21	2,990.29	144
Parity	88,352	87,990	362	0.41	1	32.0	2.42	1.82	2
Test-day Count	88,352	88,352	0	0.00	0	2,362.0	19.61	68.67	0

⚠️ KEY OUTLIERS VISIBLE IN MIN/MAX:

AFC: Min = -394 months (NEGATIVE!), Max = 24,295 months
Calving Interval: Min = 0 days, Max = 16,530 days
Days Open: Min = -280 days (NEGATIVE!), Max = 669,062 days

# Milking traits - RAW
milk_stats_raw <- rbind(
  calc_stats(milking$daily_yield, "Daily Milk Yield (kg)"),
  calc_stats(milking$milk_morning, "Morning Milk (kg)"),
  calc_stats(milking$milk_evening, "Evening Milk (kg)"),
  calc_stats(milking$dim, "Days in Milk (DIM)"),
  calc_stats(milking$parity, "Parity"),
  calc_stats(milking$afc_months, "AFC (months)"),
  calc_stats(milking$testday_no, "Test-day Number")
)

kable(milk_stats_raw, 
      caption = "Milking Data - RAW Statistics (Including Outliers)",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  column_spec(6, color = "red", bold = TRUE) %>%
  column_spec(7, color = "red", bold = TRUE)

Milking Data - RAW Statistics (Including Outliers)
Variable	N_Total	N_Valid	N_Missing	Pct_Missing	Min	Max	Mean	SD	Median
Daily Milk Yield (kg)	1,048,575	1,048,546	29	0.00	0	50	5.51	6.99	1.38
Morning Milk (kg)	1,048,575	1,048,415	160	0.02	0	459	1.90	3.84	0.00
Evening Milk (kg)	1,048,575	1,048,413	162	0.02	-1	29	1.81	3.18	0.00
Days in Milk (DIM)	1,048,575	1,048,575	0	0.00	-729,939	7,752	220.35	2,008.49	168.00
Parity	1,048,575	1,048,403	172	0.02	1	32	2.66	1.99	2.00
AFC (months)	1,048,575	389,867	658,708	62.82	-288	270	38.29	19.43	33.00
Test-day Number	1,048,575	1,017,335	31,240	2.98	1	1,514	63.66	123.64	18.00

⚠️ KEY OUTLIERS VISIBLE IN MIN/MAX:

DIM: Min = -729,939 days (EXTREME NEGATIVE!), Max = 7,752 days
Evening Milk: Min = -1 kg (NEGATIVE!)
AFC in milking: Min = -288 months (NEGATIVE!)

3 Cleaned Statistics (After Removing Outliers)

Comparing raw vs cleaned statistics to show the impact of outliers.

# Define cleaning thresholds
# AFC: 18-72 months
# Calving Interval: 300-700 days
# Lactation Length: 60-500 days
# Total Lactation Yield: > 0 and <= 15000 kg
# Days Open: 21-400 days
# Parity: 1-12
# Daily Yield: 0.5-50 kg
# DIM: 5-500 days

# Lactation - CLEANED stats
lact_stats_clean <- rbind(
  calc_stats(lactation$afc_months[lactation$parity == 1 & 
                                   lactation$afc_months >= 18 & 
                                   lactation$afc_months <= 72], 
             "AFC (months) - CLEANED"),
  calc_stats(lactation$calving_interval_days[lactation$parity > 1 & 
                                              lactation$calving_interval_days >= 300 & 
                                              lactation$calving_interval_days <= 700], 
             "Calving Interval (days) - CLEANED"),
  calc_stats(lactation$lactation_length_days[lactation$lactation_length_days >= 60 & 
                                              lactation$lactation_length_days <= 500], 
             "Lactation Length (days) - CLEANED"),
  calc_stats(lactation$total_lactation_yield[lactation$total_lactation_yield > 0 & 
                                              lactation$total_lactation_yield <= 15000], 
             "Total Lactation Yield (kg) - CLEANED"),
  calc_stats(lactation$days_open[lactation$days_open >= 21 & 
                                  lactation$days_open <= 400], 
             "Days Open - CLEANED"),
  calc_stats(lactation$parity[lactation$parity >= 1 & lactation$parity <= 12], 
             "Parity - CLEANED")
)

kable(lact_stats_clean, 
      caption = "Lactation Data - CLEANED Statistics (Outliers Removed)",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)

Lactation Data - CLEANED Statistics (Outliers Removed)
Variable	N_Total	N_Valid	N_Missing	Pct_Missing	Min	Max	Mean	SD	Median
AFC (months) - CLEANED	32,804	28,229	4,575	13.95	18.00	72.0	39.36	11.89	36
Calving Interval (days) - CLEANED	41,843	41,248	595	1.42	300.00	700.0	433.45	92.15	408
Lactation Length (days) - CLEANED	71,218	33,573	37,645	52.86	60.00	500.0	386.08	60.86	383
Total Lactation Yield (kg) - CLEANED	37,189	37,184	5	0.01	0.02	14,973.5	261.79	927.82	19
Days Open - CLEANED	78,444	40,802	37,642	47.99	21.00	400.0	150.28	87.90	126
Parity - CLEANED	88,289	87,927	362	0.41	1.00	12.0	2.41	1.76	2

# Milking - CLEANED stats
milk_stats_clean <- rbind(
  calc_stats(milking$daily_yield[milking$daily_yield >= 0.5 & milking$daily_yield <= 50], 
             "Daily Milk Yield (kg) - CLEANED"),
  calc_stats(milking$milk_morning[milking$milk_morning > 0 & milking$milk_morning <= 30], 
             "Morning Milk (kg) - CLEANED"),
  calc_stats(milking$milk_evening[milking$milk_evening > 0 & milking$milk_evening <= 20], 
             "Evening Milk (kg) - CLEANED"),
  calc_stats(milking$dim[milking$dim >= 5 & milking$dim <= 500], 
             "DIM - CLEANED"),
  calc_stats(milking$afc_months[milking$afc_months >= 18 & milking$afc_months <= 72], 
             "AFC (months) - CLEANED")
)

kable(milk_stats_clean, 
      caption = "Milking Data - CLEANED Statistics (Outliers Removed)",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)

Milking Data - CLEANED Statistics (Outliers Removed)
Variable	N_Total	N_Valid	N_Missing	Pct_Missing	Min	Max	Mean	SD	Median
Daily Milk Yield (kg) - CLEANED	675,222	675,193	29	0.00	0.50	50	8.54	7.08	8.1
Morning Milk (kg) - CLEANED	493,797	493,637	160	0.03	0.01	28	4.00	3.59	2.9
Evening Milk (kg) - CLEANED	453,383	453,221	162	0.04	0.01	20	4.19	3.67	3.4
DIM - CLEANED	957,939	957,939	0	0.00	5.00	500	169.61	112.75	156.0
AFC (months) - CLEANED	1,024,502	365,794	658,708	64.30	18.00	72	35.23	9.72	33.0

4 Comparison: Raw vs Cleaned Statistics

# Create comparison table
comparison <- data.frame(
  Trait = c("AFC (months)", "Calving Interval (days)", "Lactation Length (days)", 
            "Total Lactation Yield (kg)", "Days Open", "Daily Milk Yield (kg)", "DIM"),
  
  Raw_N = c(
    sum(!is.na(lactation$afc_months) & lactation$parity == 1),
    sum(!is.na(lactation$calving_interval_days) & lactation$parity > 1),
    sum(!is.na(lactation$lactation_length_days)),
    sum(!is.na(lactation$total_lactation_yield)),
    sum(!is.na(lactation$days_open)),
    sum(!is.na(milking$daily_yield)),
    sum(!is.na(milking$dim))
  ),
  
  Raw_Min = c(
    round(min(lactation$afc_months[lactation$parity == 1], na.rm = TRUE), 1),
    round(min(lactation$calving_interval_days[lactation$parity > 1], na.rm = TRUE), 1),
    round(min(lactation$lactation_length_days, na.rm = TRUE), 1),
    round(min(lactation$total_lactation_yield, na.rm = TRUE), 1),
    round(min(lactation$days_open, na.rm = TRUE), 1),
    round(min(milking$daily_yield, na.rm = TRUE), 2),
    round(min(milking$dim, na.rm = TRUE), 0)
  ),
  
  Raw_Max = c(
    round(max(lactation$afc_months[lactation$parity == 1], na.rm = TRUE), 1),
    round(max(lactation$calving_interval_days[lactation$parity > 1], na.rm = TRUE), 1),
    round(max(lactation$lactation_length_days, na.rm = TRUE), 1),
    round(max(lactation$total_lactation_yield, na.rm = TRUE), 1),
    round(max(lactation$days_open, na.rm = TRUE), 1),
    round(max(milking$daily_yield, na.rm = TRUE), 2),
    round(max(milking$dim, na.rm = TRUE), 0)
  ),
  
  Raw_Mean = c(
    round(mean(lactation$afc_months[lactation$parity == 1], na.rm = TRUE), 2),
    round(mean(lactation$calving_interval_days[lactation$parity > 1], na.rm = TRUE), 2),
    round(mean(lactation$lactation_length_days, na.rm = TRUE), 2),
    round(mean(lactation$total_lactation_yield, na.rm = TRUE), 2),
    round(mean(lactation$days_open, na.rm = TRUE), 2),
    round(mean(milking$daily_yield, na.rm = TRUE), 2),
    round(mean(milking$dim, na.rm = TRUE), 2)
  ),
  
  Clean_N = c(
    sum(lactation$parity == 1 & lactation$afc_months >= 18 & lactation$afc_months <= 72, na.rm = TRUE),
    sum(lactation$parity > 1 & lactation$calving_interval_days >= 300 & lactation$calving_interval_days <= 700, na.rm = TRUE),
    sum(lactation$lactation_length_days >= 60 & lactation$lactation_length_days <= 500, na.rm = TRUE),
    sum(lactation$total_lactation_yield > 0 & lactation$total_lactation_yield <= 15000, na.rm = TRUE),
    sum(lactation$days_open >= 21 & lactation$days_open <= 400, na.rm = TRUE),
    sum(milking$daily_yield >= 0.5 & milking$daily_yield <= 50, na.rm = TRUE),
    sum(milking$dim >= 5 & milking$dim <= 500, na.rm = TRUE)
  ),
  
  Clean_Mean = c(
    round(mean(lactation$afc_months[lactation$parity == 1 & lactation$afc_months >= 18 & lactation$afc_months <= 72], na.rm = TRUE), 2),
    round(mean(lactation$calving_interval_days[lactation$parity > 1 & lactation$calving_interval_days >= 300 & lactation$calving_interval_days <= 700], na.rm = TRUE), 2),
    round(mean(lactation$lactation_length_days[lactation$lactation_length_days >= 60 & lactation$lactation_length_days <= 500], na.rm = TRUE), 2),
    round(mean(lactation$total_lactation_yield[lactation$total_lactation_yield > 0 & lactation$total_lactation_yield <= 15000], na.rm = TRUE), 2),
    round(mean(lactation$days_open[lactation$days_open >= 21 & lactation$days_open <= 400], na.rm = TRUE), 2),
    round(mean(milking$daily_yield[milking$daily_yield >= 0.5 & milking$daily_yield <= 50], na.rm = TRUE), 2),
    round(mean(milking$dim[milking$dim >= 5 & milking$dim <= 500], na.rm = TRUE), 2)
  ),
  
  Filter_Applied = c(
    "18-72 months",
    "300-700 days",
    "60-500 days",
    ">0 & ≤15,000 kg",
    "21-400 days",
    "0.5-50 kg",
    "5-500 days"
  )
)

comparison <- comparison %>%
  mutate(
    Records_Lost = Raw_N - Clean_N,
    Pct_Lost = round(100 * Records_Lost / Raw_N, 2)
  )

kable(comparison, 
      caption = "Comparison: Raw vs Cleaned Statistics",
      col.names = c("Trait", "Raw N", "Raw Min", "Raw Max", "Raw Mean", 
                    "Clean N", "Clean Mean", "Filter", "Records Lost", "% Lost"),
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  column_spec(3, color = "red") %>%
  column_spec(4, color = "red") %>%
  column_spec(10, color = "blue", bold = TRUE)

Comparison: Raw vs Cleaned Statistics
Trait	Raw N	Raw Min	Raw Max	Raw Mean	Clean N	Clean Mean	Filter	Records Lost	% Lost
AFC (months)	NA	-394	24,295.0	50.96	28,229	39.36	18-72 months	NA	NA
Calving Interval (days)	NA	0	16,530.0	521.70	41,248	433.45	300-700 days	NA	NA
Lactation Length (days)	50,707	0	669,342.0	535.21	33,573	386.08	60-500 days	17,134	33.79
Total Lactation Yield (kg)	88,347	0	23,186.4	112.02	37,184	261.79	>0 & ≤15,000 kg	51,163	57.91
Days Open	50,710	-280	669,062.0	255.21	40,802	150.28	21-400 days	9,908	19.54
Daily Milk Yield (kg)	1,048,546	0	50.0	5.51	675,193	8.54	0.5-50 kg	373,353	35.61
DIM	1,048,575	-729,939	7,752.0	220.35	957,939	169.61	5-500 days	90,636	8.64

5 Overview of Data Issues

# Create summary of issues
issues_summary <- data.frame(
  Issue_Category = c(
    "Negative AFC values",
    "AFC > 100 months (8+ years)",
    "AFC < 18 months (too young)",
    "Negative calving interval",
    "Calving interval > 1000 days",
    "Negative days in milk",
    "DIM > 1000 days",
    "Daily yield = 0",
    "Daily yield > 50 kg",
    "Negative daily yield",
    "Total lactation yield = 0",
    "Total lactation yield > 20,000 kg",
    "Negative days open",
    "Sex field contains dates",
    "Animal type = 'FEMALE' (misplaced)"
  ),
  Dataset = c(rep("Lactation", 5), rep("Milking", 6), rep("Lactation", 4)),
  Count = c(
    sum(lactation$afc_months < 0, na.rm = TRUE),
    sum(lactation$afc_months > 100, na.rm = TRUE),
    sum(lactation$afc_months > 0 & lactation$afc_months < 18, na.rm = TRUE),
    sum(lactation$calving_interval_days < 0, na.rm = TRUE),
    sum(lactation$calving_interval_days > 1000, na.rm = TRUE),
    sum(milking$dim < 0, na.rm = TRUE),
    sum(milking$dim > 1000, na.rm = TRUE),
    sum(milking$daily_yield == 0, na.rm = TRUE),
    sum(milking$daily_yield > 50, na.rm = TRUE),
    sum(milking$daily_yield < 0, na.rm = TRUE),
    sum(lactation$total_lactation_yield == 0, na.rm = TRUE),
    sum(lactation$total_lactation_yield > 20000, na.rm = TRUE),
    sum(lactation$days_open < 0, na.rm = TRUE),
    sum(grepl("/", lactation$sex), na.rm = TRUE),
    sum(lactation$animal_type == "FEMALE", na.rm = TRUE)
  )
)

issues_summary <- issues_summary %>%
  mutate(
    Percentage = round(100 * Count / ifelse(Dataset == "Lactation", nrow(lactation), nrow(milking)), 3)
  ) %>%
  arrange(desc(Count))

kable(issues_summary, 
      caption = "Summary of Data Quality Issues",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE) %>%
  row_spec(which(issues_summary$Count > 1000), bold = TRUE, color = "red")

Summary of Data Quality Issues
Issue_Category	Dataset	Count	Percentage
Daily yield = 0	Milking	333,069	31.764
Total lactation yield = 0	Milking	51,154	4.878
DIM > 1000 days	Milking	22,267	2.124
Calving interval > 1000 days	Lactation	3,091	3.499
AFC > 100 months (8+ years)	Lactation	2,280	2.581
Negative days open	Lactation	1,531	1.733
AFC < 18 months (too young)	Lactation	723	0.818
Negative AFC values	Lactation	480	0.543
Negative days in milk	Milking	10	0.001
Sex field contains dates	Lactation	8	0.009
Animal type = ‘FEMALE’ (misplaced)	Lactation	8	0.009
Total lactation yield > 20,000 kg	Lactation	1	0.001
Negative calving interval	Lactation	0	0.000
Daily yield > 50 kg	Milking	0	0.000
Negative daily yield	Milking	0	0.000

6 Age at First Calving (AFC) Issues

6.1 Distribution with Outliers

# AFC distribution showing all values including outliers
afc_data <- lactation %>%
  filter(parity == 1, !is.na(afc_months)) %>%
  mutate(
    afc_category = case_when(
      afc_months < 0 ~ "Negative (ERROR)",
      afc_months >= 0 & afc_months < 18 ~ "Too Young (<18 mo)",
      afc_months >= 18 & afc_months <= 72 ~ "Normal (18-72 mo)",
      afc_months > 72 & afc_months <= 100 ~ "Late (72-100 mo)",
      afc_months > 100 ~ "Extreme (>100 mo)"
    )
  )

# Summary table with Min, Max, Mean, SD
afc_summary <- afc_data %>%
  group_by(afc_category) %>%
  summarise(
    N = n(),
    Min = round(min(afc_months, na.rm = TRUE), 1),
    Max = round(max(afc_months, na.rm = TRUE), 1),
    Mean = round(mean(afc_months, na.rm = TRUE), 2),
    SD = round(sd(afc_months, na.rm = TRUE), 2),
    Median = round(median(afc_months, na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  mutate(Percentage = round(100 * N / sum(N), 2))

kable(afc_summary, 
      caption = "AFC Value Categories with Min/Max/Mean",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

AFC Value Categories with Min/Max/Mean
afc_category	N	Min	Max	Mean	SD	Median	Percentage
Extreme (>100 mo)	1,481	101	24,295	249.59	1,706.99	117	4.49
Late (72-100 mo)	2,495	73	100	84.47	8.11	84	7.56
Negative (ERROR)	127	-394	-1	-102.76	106.54	-56	0.38
Normal (18-72 mo)	28,229	18	72	39.36	11.89	36	85.50
Too Young (<18 mo)	683	0	17	5.74	6.30	3	2.07

# Plot 1: Full distribution with log scale
p1 <- ggplot(afc_data, aes(x = afc_months)) +
  geom_histogram(aes(fill = afc_category), bins = 100, color = "black", linewidth = 0.1) +
  scale_fill_manual(values = c(
    "Negative (ERROR)" = "red",
    "Too Young (<18 mo)" = "orange",
    "Normal (18-72 mo)" = "darkgreen",
    "Late (72-100 mo)" = "yellow",
    "Extreme (>100 mo)" = "purple"
  )) +
  geom_vline(xintercept = c(0, 18, 72), linetype = "dashed", color = "black", linewidth = 0.8) +
  annotate("text", x = 0, y = Inf, label = "0", vjust = 2, hjust = 0.5, size = 3) +
  annotate("text", x = 18, y = Inf, label = "18", vjust = 2, hjust = 0.5, size = 3) +
  annotate("text", x = 72, y = Inf, label = "72", vjust = 2, hjust = 0.5, size = 3) +
  labs(
    title = "Age at First Calving - Full Distribution",
    subtitle = paste0("Min = ", round(min(afc_data$afc_months), 1), " | Max = ", 
                      format(round(max(afc_data$afc_months), 1), big.mark = ","), 
                      " | Mean = ", round(mean(afc_data$afc_months), 1), " months"),
    x = "AFC (months)",
    y = "Count",
    fill = "Category"
  ) +
  theme(legend.position = "bottom") +
  scale_y_log10(labels = comma)

# Plot 2: Boxplot showing outliers
p2 <- ggplot(afc_data, aes(y = afc_months)) +
  geom_boxplot(fill = "steelblue", outlier.color = "red", outlier.size = 1) +
  labs(
    title = "AFC Boxplot - Outliers in Red",
    y = "AFC (months)"
  ) +
  coord_flip()

grid.arrange(p1, p2, ncol = 1, heights = c(2, 1))

6.2 Extreme AFC Values

# Show records with extreme AFC values
extreme_afc <- lactation %>%
  filter(parity == 1, !is.na(afc_months)) %>%
  filter(afc_months < 0 | afc_months > 100) %>%
  select(animal_id, farm_id, project_name, main_breed_name, 
         afc_months, afc_days, birthdate, calving_date) %>%
  arrange(afc_months) %>%
  head(50)

kable(extreme_afc, 
      caption = "Sample of Extreme AFC Values (Negative or > 100 months)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = TRUE) %>%
  scroll_box(height = "400px")

Sample of Extreme AFC Values (Negative or > 100 months)
animal_id	farm_id	project_name	main_breed_name	afc_months	afc_days	birthdate	calving_date
176081	514151	NA	Holstein Friesian	-394	-11956	04/09/2018	7/15/1985
175712	514149	NA	Holstein Friesian	-358	-10868	04/09/2018	07/07/1988
175675	515076	NA	Local Zebu	-344	-10445	9/20/2017	2/14/1989
187786	510092	NA	Sahiwal	-309	-9378	7/16/2016	11/12/1990
178409	515077	NA	Unknown	-306	-9297	09/10/2019	3/28/1994
195491	598285	NA	Ayrshire	-296	-8998	02/01/2019	6/14/1994
178911	598285	NA	Holstein Friesian	-292	-8848	02/01/2020	11/11/1995
170328	595412	NA	Ayrshire	-291	-8822	05/08/2018	3/13/1994
178404	598285	NA	Holstein Friesian	-288	-8748	02/01/2020	2/19/1996
170285	515160	NA	Ayrshire	-273	-8282	7/27/2016	11/23/1993
171698	616608	NA	Holstein Friesian	-273	-8278	6/15/2016	10/16/1993
176871	597275	NA	Local Zebu	-272	-8256	10/21/2010	3/14/1988
170515	510092	NA	Sahiwal	-261	-7917	01/07/2018	05/05/1996
746443	601235	KLBA	Ayrshire	-258	-7831	05/04/2005	11/25/1983
335377	347826	STANLEY AND SON LIMITED - KENYA	Boran	-257	-7801	4/28/2011	12/18/1989
339751	347826	STANLEY AND SON LIMITED - KENYA	Boran	-254	-7698	1/15/2011	12/18/1989
170557	597358	NA	Holstein Friesian	-241	-7320	04/01/2020	3/17/2000
170506	514950	NA	Local Zebu	-240	-7272	05/04/2017	06/06/1997
170315	598316	NA	Ayrshire	-238	-7223	5/18/2018	08/08/1998
171861	514852	NA	NA	-237	-7202	03/07/2017	6/18/1997
171864	514855	NA	NA	-227	-6892	03/08/2017	4/25/1998
171859	514850	NA	NA	-226	-6858	03/07/2017	5/28/1998
174152	403781	AVCD	Holstein Friesian	-224	-6790	9/21/2016	2/18/1998
171863	514854	NA	NA	-223	-6771	03/08/2017	8/24/1998
171328	515320	NA	Holstein Friesian	-216	-6545	4/20/2019	5/19/2001
203420	514952	NA	Local Zebu	-212	-6432	05/04/2017	9/24/1999
171887	515079	NA	Local Zebu	-208	-6302	09/11/2019	06/10/2002
179369	598285	NA	Holstein Friesian	-206	-6235	02/01/2021	01/07/2004
792916	601289	KALRO	Ayrshire	-206	-6261	12/01/2022	10/10/2005
171888	403011	AVCD	Unknown	-205	-6223	5/18/2019	05/04/2002
170513	515024	NA	Unknown	-204	-6175	5/29/2019	07/02/2002
813818	601252	KLBA	Holstein Friesian	-204	-6193	4/18/2012	05/05/1995
175682	510092	NA	Unknown	-200	-6051	7/16/2016	12/22/1999
171299	514952	NA	Local Zebu	-196	-5944	05/04/2017	1/24/2001
743491	601217	KLBA	Jersey	-185	-5619	09/02/1997	4/15/1982
758995	601397	KLBA	Holstein Friesian	-185	-5621	11/15/2017	6/26/2002
754668	601289	KALRO	Ayrshire	-183	-5548	8/13/2019	06/04/2004
786263	601210	KLBA	Holstein Friesian	-179	-5417	04/04/2007	06/04/1992
801205	601289	KALRO	Holstein Friesian	-169	-5127	11/28/2022	11/14/2008
170559	616901	NA	Ayrshire	-166	-5027	10/17/2016	01/12/2003
177867	616942	NA	Ayrshire	-166	-5020	10/25/2016	1/27/2003
186024	598297	NA	Holstein Friesian	-156	-4738	05/07/2020	5/18/2007
813828	601252	KLBA	Holstein Friesian	-148	-4477	02/09/2012	11/07/1999
186411	515711	NA	Ayrshire	-147	-4468	4/20/2020	1/26/2008
286489	347826	STANLEY AND SON LIMITED - KENYA	Holstein Friesian	-140	-4252	5/22/1978	9/30/1966
748626	601246	KLBA	Ayrshire	-135	-4082	03/01/2006	12/27/1994
187716	598291	NA	Ayrshire	-126	-3817	10/07/2017	4/26/2007
187128	513219	NA	Unknown	-123	-3732	08/01/2019	5/13/2009
187718	598285	NA	Holstein Friesian	-114	-3437	02/01/2020	09/04/2010
195487	598285	NA	Ayrshire	-110	-3315	5/27/2018	4/29/2009

6.3 AFC by Breed

# AFC by breed - with proper filtering
afc_by_breed <- lactation %>%
  filter(parity == 1, !is.na(afc_months), !is.na(main_breed_name),
         afc_months >= 18, afc_months <= 72) %>%  # Clean data only
  group_by(main_breed_name) %>%
  summarise(
    N = n(),
    Min = round(min(afc_months, na.rm = TRUE), 1),
    Max = round(max(afc_months, na.rm = TRUE), 1),
    Mean = round(mean(afc_months, na.rm = TRUE), 2),
    SD = round(sd(afc_months, na.rm = TRUE), 2),
    Median = round(median(afc_months, na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  filter(N >= 30) %>%
  arrange(Mean)

kable(afc_by_breed, 
      caption = "AFC by Breed (Cleaned: 18-72 months, N≥30)",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

AFC by Breed (Cleaned: 18-72 months, N≥30)
main_breed_name	N	Min	Max	Mean	SD	Median
Scandinavian Red	50	18	38	26.84	4.51	25.5
Jersey crosses	70	18	52	30.11	6.17	29.5
Gir	199	21	72	31.07	7.48	30.0
Fleckvieh	55	21	62	31.49	8.06	30.0
Jersey	1,144	18	72	31.50	10.19	28.0
Holstein-Friesian crosses	1,255	18	72	35.78	7.02	35.0
Ayrshire crosses	88	18	72	36.75	10.89	34.0
Holstein Friesian	15,944	18	72	37.21	11.06	34.0
Sahiwal crosses	237	22	60	38.36	8.24	37.0
Boran	101	19	69	38.67	9.09	37.0
Unknown	43	24	71	40.09	14.00	36.0
Guernsey	574	19	72	41.44	12.62	38.0
Ayrshire	5,188	18	72	43.49	11.60	41.0
Brown Swiss	147	19	72	44.18	13.94	42.0
Sahiwal	2,875	18	72	49.27	11.44	49.0

# Plot
ggplot(afc_by_breed, aes(x = reorder(main_breed_name, Mean), y = Mean)) +
  geom_bar(stat = "identity", fill = "steelblue", width = 0.7) +
  geom_errorbar(aes(ymin = Mean - SD, ymax = Mean + SD), width = 0.2, color = "black") +
  geom_text(aes(label = paste0(round(Mean, 1), "\n(n=", N, ")")), 
            hjust = -0.1, size = 3) +
  coord_flip() +
  labs(
    title = "Mean Age at First Calving by Breed",
    subtitle = "Error bars = ±1 SD | Filtered: 18-72 months",
    x = "Breed",
    y = "Mean AFC (months)"
  ) +
  scale_y_continuous(limits = c(0, max(afc_by_breed$Mean + afc_by_breed$SD, na.rm = TRUE) * 1.3))

7 Daily Milk Yield Issues

7.1 Distribution with Outliers

# Daily yield categories
dmy_data <- milking %>%
  filter(!is.na(daily_yield)) %>%
  mutate(
    yield_category = case_when(
      daily_yield < 0 ~ "Negative (ERROR)",
      daily_yield == 0 ~ "Zero Yield",
      daily_yield > 0 & daily_yield < 0.5 ~ "Very Low (<0.5 kg)",
      daily_yield >= 0.5 & daily_yield <= 30 ~ "Normal (0.5-30 kg)",
      daily_yield > 30 & daily_yield <= 50 ~ "High (30-50 kg)",
      daily_yield > 50 ~ "Extreme (>50 kg)"
    )
  )

# Summary with Min, Max, Mean
dmy_summary <- dmy_data %>%
  group_by(yield_category) %>%
  summarise(
    N = n(),
    Min = round(min(daily_yield, na.rm = TRUE), 2),
    Max = round(max(daily_yield, na.rm = TRUE), 2),
    Mean = round(mean(daily_yield, na.rm = TRUE), 2),
    SD = round(sd(daily_yield, na.rm = TRUE), 2),
    Median = round(median(daily_yield, na.rm = TRUE), 2),
    .groups = "drop"
  ) %>%
  mutate(Percentage = round(100 * N / sum(N), 2))

kable(dmy_summary, 
      caption = "Daily Milk Yield Categories with Min/Max/Mean",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Daily Milk Yield Categories with Min/Max/Mean
yield_category	N	Min	Max	Mean	SD	Median	Percentage
High (30-50 kg)	1,936	30.10	50.00	32.68	2.83	31.9	0.18
Normal (0.5-30 kg)	673,257	0.50	30.00	8.47	6.96	8.1	64.21
Very Low (<0.5 kg)	40,284	0.01	0.49	0.28	0.12	0.3	3.84
Zero Yield	333,069	0.00	0.00	0.00	0.00	0.0	31.76

# Plot 1: Histogram
p1 <- ggplot(dmy_data, aes(x = daily_yield)) +
  geom_histogram(aes(fill = yield_category), bins = 100, color = "black", linewidth = 0.1) +
  scale_fill_manual(values = c(
    "Negative (ERROR)" = "red",
    "Zero Yield" = "gray40",
    "Very Low (<0.5 kg)" = "orange",
    "Normal (0.5-30 kg)" = "darkgreen",
    "High (30-50 kg)" = "blue",
    "Extreme (>50 kg)" = "purple"
  )) +
  labs(
    title = "Daily Milk Yield - Full Distribution",
    subtitle = paste0("Min = ", round(min(dmy_data$daily_yield), 2), " | Max = ", 
                      round(max(dmy_data$daily_yield), 2), " | Mean = ", 
                      round(mean(dmy_data$daily_yield), 2), " kg"),
    x = "Daily Yield (kg)",
    y = "Count (log scale)",
    fill = "Category"
  ) +
  scale_y_log10(labels = comma) +
  theme(legend.position = "bottom")

# Plot 2: Focus on normal range with cleaned mean
clean_dmy <- dmy_data %>% filter(daily_yield >= 0.5, daily_yield <= 50)
p2 <- ggplot(clean_dmy, aes(x = daily_yield)) +
  geom_histogram(bins = 50, fill = "steelblue", color = "black", linewidth = 0.1) +
  geom_vline(xintercept = mean(clean_dmy$daily_yield), 
             linetype = "dashed", color = "red", linewidth = 1) +
  annotate("text", x = mean(clean_dmy$daily_yield), y = Inf, 
           label = paste0("Mean = ", round(mean(clean_dmy$daily_yield), 2), " kg"),
           vjust = 2, hjust = -0.1, color = "red", size = 4) +
  labs(
    title = "Daily Milk Yield - Cleaned (0.5-50 kg)",
    subtitle = paste0("N = ", format(nrow(clean_dmy), big.mark = ","), 
                      " | Mean = ", round(mean(clean_dmy$daily_yield), 2),
                      " | SD = ", round(sd(clean_dmy$daily_yield), 2), " kg"),
    x = "Daily Yield (kg)",
    y = "Count"
  ) +
  scale_y_continuous(labels = comma)

# Plot 3: Boxplot
p3 <- ggplot(dmy_data, aes(y = daily_yield)) +
  geom_boxplot(fill = "lightblue", outlier.color = "red", outlier.size = 0.5) +
  labs(title = "Daily Yield Boxplot", y = "Daily Yield (kg)") +
  coord_flip()

grid.arrange(p1, p2, p3, ncol = 1, heights = c(2, 2, 1))

7.2 Daily Yield by Breed

# Daily yield by breed - cleaned
dmy_by_breed <- milking %>%
  filter(!is.na(daily_yield), !is.na(main_breed_name),
         daily_yield >= 0.5, daily_yield <= 50) %>%
  group_by(main_breed_name) %>%
  summarise(
    N_Records = n(),
    N_Animals = n_distinct(animal_id),
    Min = round(min(daily_yield, na.rm = TRUE), 2),
    Max = round(max(daily_yield, na.rm = TRUE), 2),
    Mean = round(mean(daily_yield, na.rm = TRUE), 2),
    SD = round(sd(daily_yield, na.rm = TRUE), 2),
    Median = round(median(daily_yield, na.rm = TRUE), 2),
    .groups = "drop"
  ) %>%
  filter(N_Records >= 500) %>%
  arrange(desc(Mean))

kable(dmy_by_breed, 
      caption = "Daily Milk Yield by Breed (Cleaned: 0.5-50 kg, N≥500)",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Daily Milk Yield by Breed (Cleaned: 0.5-50 kg, N≥500)
main_breed_name	N_Records	N_Animals	Min	Max	Mean	SD	Median
Jersey crosses	25,699	55	0.5	31.4	14.61	4.97	14.20
Gir	82,442	184	0.5	39.5	14.14	5.28	13.60
Ayrshire crosses	16,660	41	0.5	34.0	13.90	4.83	13.70
Sahiwal crosses	24,438	41	1.0	37.1	13.22	5.23	12.65
Holstein-Friesian crosses	69,743	394	0.8	37.5	13.19	5.40	12.20
Scandinavian Red	23,510	49	1.0	34.0	12.84	4.50	12.30
Brown Swiss	21,789	126	0.5	36.0	12.54	6.59	12.30
Girolando	12,472	22	1.4	31.1	12.30	4.53	12.30
Fleckvieh	16,331	46	0.5	28.9	12.05	4.54	11.60
Boran	3,280	39	0.5	25.2	10.44	4.39	9.60
Holstein Friesian	250,367	9,107	0.5	50.0	5.56	6.38	2.00
Jersey	14,583	537	0.5	30.0	5.12	6.01	1.42
Sahiwal	26,437	1,422	0.5	30.0	3.26	3.69	2.00
Ayrshire	74,726	3,557	0.5	36.3	2.83	3.94	1.37
Guernsey	8,131	419	0.5	27.6	1.74	2.45	0.95

# Plot
ggplot(dmy_by_breed, aes(x = reorder(main_breed_name, Mean), y = Mean)) +
  geom_bar(stat = "identity", fill = "darkgreen", width = 0.7) +
  geom_errorbar(aes(ymin = Mean - SD/sqrt(N_Records)*1.96, 
                    ymax = Mean + SD/sqrt(N_Records)*1.96), 
                width = 0.2, color = "black") +
  geom_text(aes(label = paste0(round(Mean, 1), " kg")), 
            hjust = -0.1, size = 3) +
  coord_flip() +
  labs(
    title = "Mean Daily Milk Yield by Breed",
    subtitle = "Error bars = 95% CI | Filtered: 0.5-50 kg",
    x = "Breed",
    y = "Mean Daily Yield (kg)"
  ) +
  scale_y_continuous(limits = c(0, max(dmy_by_breed$Mean, na.rm = TRUE) * 1.3))

8 Days in Milk (DIM) Issues

8.1 Distribution

# DIM categories
dim_data <- milking %>%
  filter(!is.na(dim)) %>%
  mutate(
    dim_category = case_when(
      dim < 0 ~ "Negative (ERROR)",
      dim == 0 ~ "Zero DIM",
      dim >= 1 & dim <= 5 ~ "Very Early (1-5 d)",
      dim > 5 & dim <= 305 ~ "Normal (5-305 d)",
      dim > 305 & dim <= 500 ~ "Extended (305-500 d)",
      dim > 500 & dim <= 1000 ~ "Very Extended (500-1000 d)",
      dim > 1000 ~ "Extreme (>1000 d)"
    )
  )

# Summary
dim_summary <- dim_data %>%
  group_by(dim_category) %>%
  summarise(
    N = n(),
    Min = min(dim, na.rm = TRUE),
    Max = max(dim, na.rm = TRUE),
    Mean = round(mean(dim, na.rm = TRUE), 1),
    SD = round(sd(dim, na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  mutate(Percentage = round(100 * N / sum(N), 2))

kable(dim_summary, 
      caption = "Days in Milk Categories with Min/Max/Mean",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Days in Milk Categories with Min/Max/Mean
dim_category	N	Min	Max	Mean	SD	Percentage
Extended (305-500 d)	117,863	306	500	376.9	55.0	11.24
Extreme (>1000 d)	22,267	1,001	7,752	1,596.5	718.3	2.12
Negative (ERROR)	10	-729,939	-298,437	-633,728.4	119,639.4	0.00
Normal (5-305 d)	828,097	6	305	142.5	83.9	78.97
Very Early (1-5 d)	19,458	1	5	4.3	1.1	1.86
Very Extended (500-1000 d)	57,618	501	1,000	682.7	138.3	5.49
Zero DIM	3,262	0	0	0.0	0.0	0.31

# Plot
p1 <- ggplot(dim_data, aes(x = dim)) +
  geom_histogram(aes(fill = dim_category), bins = 100, color = "black", linewidth = 0.1) +
  scale_fill_manual(values = c(
    "Negative (ERROR)" = "red",
    "Zero DIM" = "gray40",
    "Very Early (1-5 d)" = "orange",
    "Normal (5-305 d)" = "darkgreen",
    "Extended (305-500 d)" = "yellow",
    "Very Extended (500-1000 d)" = "purple",
    "Extreme (>1000 d)" = "black"
  )) +
  labs(
    title = "Days in Milk - Full Distribution",
    subtitle = paste0("Min = ", format(min(dim_data$dim), big.mark = ","), 
                      " | Max = ", format(max(dim_data$dim), big.mark = ","),
                      " (Note extreme negative!)"),
    x = "DIM (days)",
    y = "Count (log scale)",
    fill = "Category"
  ) +
  scale_y_log10(labels = comma) +
  theme(legend.position = "bottom")

print(p1)

9 Total Lactation Yield Issues

9.1 Distribution

# TLY data
tly_data <- lactation %>%
  filter(!is.na(total_lactation_yield)) %>%
  mutate(
    tly_category = case_when(
      total_lactation_yield < 0 ~ "Negative (ERROR)",
      total_lactation_yield == 0 ~ "Zero Yield",
      total_lactation_yield > 0 & total_lactation_yield < 100 ~ "Very Low (<100 kg)",
      total_lactation_yield >= 100 & total_lactation_yield <= 5000 ~ "Normal (100-5000 kg)",
      total_lactation_yield > 5000 & total_lactation_yield <= 10000 ~ "High (5000-10000 kg)",
      total_lactation_yield > 10000 & total_lactation_yield <= 15000 ~ "Very High (10-15k kg)",
      total_lactation_yield > 15000 ~ "Extreme (>15000 kg)"
    )
  )

# Summary
tly_summary <- tly_data %>%
  group_by(tly_category) %>%
  summarise(
    N = n(),
    Min = round(min(total_lactation_yield, na.rm = TRUE), 1),
    Max = round(max(total_lactation_yield, na.rm = TRUE), 1),
    Mean = round(mean(total_lactation_yield, na.rm = TRUE), 1),
    SD = round(sd(total_lactation_yield, na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  mutate(Percentage = round(100 * N / sum(N), 2))

kable(tly_summary, 
      caption = "Total Lactation Yield Categories with Min/Max/Mean",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Total Lactation Yield Categories with Min/Max/Mean
tly_category	N	Min	Max	Mean	SD	Percentage
Extreme (>15000 kg)	9	15,205.2	23,186.4	18,090.9	2,309.1	0.01
High (5000-10000 kg)	361	5,004.2	9,979.2	6,615.5	1,290.3	0.41
Normal (100-5000 kg)	9,199	100.0	4,999.0	694.1	953.7	10.41
Very High (10-15k kg)	39	10,000.7	14,973.5	11,598.2	1,589.4	0.04
Very Low (<100 kg)	27,585	0.0	99.9	18.5	19.0	31.22
Zero Yield	51,154	0.0	0.0	0.0	0.0	57.90

# Major issue: Many zero yields!
zero_tly_pct <- round(100 * sum(tly_data$total_lactation_yield == 0) / nrow(tly_data), 1)

# Plot
p1 <- ggplot(tly_data, aes(x = total_lactation_yield)) +
  geom_histogram(aes(fill = tly_category), bins = 100, color = "black", linewidth = 0.1) +
  scale_fill_manual(values = c(
    "Negative (ERROR)" = "red",
    "Zero Yield" = "gray40",
    "Very Low (<100 kg)" = "orange",
    "Normal (100-5000 kg)" = "darkgreen",
    "High (5000-10000 kg)" = "blue",
    "Very High (10-15k kg)" = "purple",
    "Extreme (>15000 kg)" = "black"
  )) +
  labs(
    title = "Total Lactation Yield - Full Distribution",
    subtitle = paste0("WARNING: ", zero_tly_pct, "% of records have ZERO yield!"),
    x = "Total Lactation Yield (kg)",
    y = "Count (log scale)",
    fill = "Category"
  ) +
  scale_y_log10(labels = comma) +
  theme(legend.position = "bottom")

# Non-zero distribution with mean
tly_nonzero <- tly_data %>% filter(total_lactation_yield > 0, total_lactation_yield <= 15000)
p2 <- ggplot(tly_nonzero, aes(x = total_lactation_yield)) +
  geom_histogram(bins = 50, fill = "steelblue", color = "black") +
  geom_vline(xintercept = mean(tly_nonzero$total_lactation_yield), 
             linetype = "dashed", color = "red", linewidth = 1) +
  annotate("text", x = mean(tly_nonzero$total_lactation_yield), y = Inf,
           label = paste0("Mean = ", round(mean(tly_nonzero$total_lactation_yield), 1), " kg"),
           vjust = 2, hjust = -0.1, color = "red") +
  labs(
    title = "Total Lactation Yield - Non-Zero Values Only",
    subtitle = paste0("N = ", format(nrow(tly_nonzero), big.mark = ","),
                      " | Mean = ", round(mean(tly_nonzero$total_lactation_yield), 1),
                      " | SD = ", round(sd(tly_nonzero$total_lactation_yield), 1), " kg"),
    x = "Total Lactation Yield (kg)",
    y = "Count"
  )

grid.arrange(p1, p2, ncol = 1)

10 Calving Interval Issues

10.1 Distribution

# CI data (parity > 1)
ci_data <- lactation %>%
  filter(parity > 1, !is.na(calving_interval_days)) %>%
  mutate(
    ci_category = case_when(
      calving_interval_days < 0 ~ "Negative (ERROR)",
      calving_interval_days >= 0 & calving_interval_days < 300 ~ "Too Short (<300 d)",
      calving_interval_days >= 300 & calving_interval_days <= 450 ~ "Optimal (300-450 d)",
      calving_interval_days > 450 & calving_interval_days <= 600 ~ "Long (450-600 d)",
      calving_interval_days > 600 & calving_interval_days <= 730 ~ "Very Long (600-730 d)",
      calving_interval_days > 730 ~ "Extreme (>730 d / 2 yrs)"
    )
  )

# Summary
ci_summary <- ci_data %>%
  group_by(ci_category) %>%
  summarise(
    N = n(),
    Min = round(min(calving_interval_days, na.rm = TRUE), 1),
    Max = round(max(calving_interval_days, na.rm = TRUE), 1),
    Mean = round(mean(calving_interval_days, na.rm = TRUE), 1),
    SD = round(sd(calving_interval_days, na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  mutate(Percentage = round(100 * N / sum(N), 2))

kable(ci_summary, 
      caption = "Calving Interval Categories with Min/Max/Mean",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Calving Interval Categories with Min/Max/Mean
ci_category	N	Min	Max	Mean	SD	Percentage
Extreme (>730 d / 2 yrs)	6,784	731	16,530	1,135.2	595.6	13.44
Long (450-600 d)	11,011	451	600	512.1	41.8	21.82
Optimal (300-450 d)	27,179	300	450	377.6	36.5	53.86
Too Short (<300 d)	1,761	0	299	151.7	108.3	3.49
Very Long (600-730 d)	3,728	601	730	659.3	37.6	7.39

# Plots
p1 <- ggplot(ci_data, aes(x = calving_interval_days)) +
  geom_histogram(aes(fill = ci_category), bins = 100, color = "black", linewidth = 0.1) +
  scale_fill_manual(values = c(
    "Negative (ERROR)" = "red",
    "Too Short (<300 d)" = "orange",
    "Optimal (300-450 d)" = "darkgreen",
    "Long (450-600 d)" = "yellow",
    "Very Long (600-730 d)" = "purple",
    "Extreme (>730 d / 2 yrs)" = "black"
  )) +
  labs(
    title = "Calving Interval - Full Distribution",
    subtitle = paste0("Min = ", round(min(ci_data$calving_interval_days), 1), 
                      " | Max = ", format(round(max(ci_data$calving_interval_days), 1), big.mark = ","),
                      " | Mean = ", round(mean(ci_data$calving_interval_days), 1), " days"),
    x = "Calving Interval (days)",
    y = "Count (log scale)",
    fill = "Category"
  ) +
  scale_y_log10(labels = comma) +
  theme(legend.position = "bottom")

print(p1)

11 Days Open Issues

11.1 Distribution

# Days open data
do_data <- lactation %>%
  filter(!is.na(days_open)) %>%
  mutate(
    do_category = case_when(
      days_open < 0 ~ "Negative (ERROR)",
      days_open >= 0 & days_open < 21 ~ "Too Short (<21 d)",
      days_open >= 21 & days_open <= 120 ~ "Optimal (21-120 d)",
      days_open > 120 & days_open <= 200 ~ "Acceptable (120-200 d)",
      days_open > 200 & days_open <= 365 ~ "Long (200-365 d)",
      days_open > 365 ~ "Extreme (>365 d)"
    )
  )

# Summary
do_summary <- do_data %>%
  group_by(do_category) %>%
  summarise(
    N = n(),
    Min = round(min(days_open, na.rm = TRUE), 1),
    Max = round(max(days_open, na.rm = TRUE), 1),
    Mean = round(mean(days_open, na.rm = TRUE), 1),
    SD = round(sd(days_open, na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  mutate(Percentage = round(100 * N / sum(N), 2))

kable(do_summary, 
      caption = "Days Open Categories with Min/Max/Mean",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Days Open Categories with Min/Max/Mean
do_category	N	Min	Max	Mean	SD	Percentage
Acceptable (120-200 d)	11,082	121	200	155.7	22.8	21.85
Extreme (>365 d)	9,053	366	669,062	821.1	7,046.7	17.85
Long (200-365 d)	9,364	201	365	266.3	45.8	18.47
Negative (ERROR)	1,531	-280	-1	-159.2	97.2	3.02
Optimal (21-120 d)	19,353	21	120	79.0	24.0	38.16
Too Short (<21 d)	327	0	20	11.6	6.0	0.64

# Plot
p1 <- ggplot(do_data, aes(x = days_open)) +
  geom_histogram(aes(fill = do_category), bins = 100, color = "black", linewidth = 0.1) +
  scale_fill_manual(values = c(
    "Negative (ERROR)" = "red",
    "Too Short (<21 d)" = "orange",
    "Optimal (21-120 d)" = "darkgreen",
    "Acceptable (120-200 d)" = "lightgreen",
    "Long (200-365 d)" = "yellow",
    "Extreme (>365 d)" = "purple"
  )) +
  labs(
    title = "Days Open - Full Distribution",
    subtitle = paste0("Min = ", round(min(do_data$days_open), 1), " (NEGATIVE!) | Max = ", 
                      format(round(max(do_data$days_open), 1), big.mark = ",")),
    x = "Days Open",
    y = "Count (log scale)",
    fill = "Category"
  ) +
  scale_y_log10(labels = comma) +
  scale_x_continuous(limits = c(-500, 2000)) +
  theme(legend.position = "bottom")

print(p1)

12 Combined Outlier Scatter Plots

12.1 AFC vs Daily Yield

# Merge data for scatter plots
scatter_data <- milking %>%
  filter(!is.na(afc_months), !is.na(daily_yield)) %>%
  filter(afc_months > -500, afc_months < 500, daily_yield >= 0, daily_yield <= 60)

# Sample if too large
if(nrow(scatter_data) > 50000) {
  scatter_data <- scatter_data %>% sample_n(50000)
}

ggplot(scatter_data, aes(x = afc_months, y = daily_yield)) +
  geom_point(alpha = 0.3, size = 0.5, color = "steelblue") +
  geom_vline(xintercept = c(0, 18, 72), linetype = "dashed", color = "red") +
  geom_hline(yintercept = c(0, 50), linetype = "dashed", color = "red") +
  annotate("rect", xmin = 18, xmax = 72, ymin = 0.5, ymax = 50, 
           alpha = 0.1, fill = "green") +
  labs(
    title = "AFC vs Daily Milk Yield - Outlier Detection",
    subtitle = "Green zone = expected normal range; Red lines = thresholds",
    x = "Age at First Calving (months)",
    y = "Daily Milk Yield (kg)"
  )

12.2 DIM vs Daily Yield

scatter_dim <- milking %>%
  filter(!is.na(dim), !is.na(daily_yield)) %>%
  filter(dim > -1000, dim < 1000, daily_yield >= 0, daily_yield <= 60)

# Sample if too large
if(nrow(scatter_dim) > 50000) {
  scatter_dim <- scatter_dim %>% sample_n(50000)
}

ggplot(scatter_dim, aes(x = dim, y = daily_yield)) +
  geom_point(alpha = 0.3, size = 0.5, color = "darkgreen") +
  geom_vline(xintercept = c(0, 305, 500), linetype = "dashed", color = "red") +
  geom_hline(yintercept = c(0, 50), linetype = "dashed", color = "red") +
  annotate("rect", xmin = 5, xmax = 305, ymin = 0.5, ymax = 40, 
           alpha = 0.1, fill = "green") +
  labs(
    title = "Days in Milk vs Daily Yield - Outlier Detection",
    subtitle = "Green zone = expected normal range",
    x = "Days in Milk (DIM)",
    y = "Daily Milk Yield (kg)"
  )

13 Summary and Recommendations

13.1 Data Quality Summary

# Create final summary table with proper NA handling
quality_summary <- data.frame(
  Trait = c("Age at First Calving", "Calving Interval", "Total Lactation Yield",
            "Daily Milk Yield", "Days in Milk", "Days Open", "Parity"),
  
  N_Total = c(
    sum(!is.na(lactation$afc_months) & lactation$parity == 1),
    sum(!is.na(lactation$calving_interval_days) & lactation$parity > 1),
    sum(!is.na(lactation$total_lactation_yield)),
    sum(!is.na(milking$daily_yield)),
    sum(!is.na(milking$dim)),
    sum(!is.na(lactation$days_open)),
    sum(!is.na(lactation$parity))
  ),
  
  Min_Value = c(
    round(min(lactation$afc_months[lactation$parity == 1], na.rm = TRUE), 1),
    round(min(lactation$calving_interval_days[lactation$parity > 1], na.rm = TRUE), 1),
    round(min(lactation$total_lactation_yield, na.rm = TRUE), 1),
    round(min(milking$daily_yield, na.rm = TRUE), 2),
    round(min(milking$dim, na.rm = TRUE), 0),
    round(min(lactation$days_open, na.rm = TRUE), 1),
    round(min(lactation$parity, na.rm = TRUE), 0)
  ),
  
  Max_Value = c(
    round(max(lactation$afc_months[lactation$parity == 1], na.rm = TRUE), 1),
    round(max(lactation$calving_interval_days[lactation$parity > 1], na.rm = TRUE), 1),
    round(max(lactation$total_lactation_yield, na.rm = TRUE), 1),
    round(max(milking$daily_yield, na.rm = TRUE), 2),
    round(max(milking$dim, na.rm = TRUE), 0),
    round(max(lactation$days_open, na.rm = TRUE), 1),
    round(max(lactation$parity, na.rm = TRUE), 0)
  ),
  
  Raw_Mean = c(
    round(mean(lactation$afc_months[lactation$parity == 1], na.rm = TRUE), 2),
    round(mean(lactation$calving_interval_days[lactation$parity > 1], na.rm = TRUE), 2),
    round(mean(lactation$total_lactation_yield, na.rm = TRUE), 2),
    round(mean(milking$daily_yield, na.rm = TRUE), 2),
    round(mean(milking$dim, na.rm = TRUE), 2),
    round(mean(lactation$days_open, na.rm = TRUE), 2),
    round(mean(lactation$parity, na.rm = TRUE), 2)
  ),
  
  Recommended_Range = c(
    "18-72 months",
    "300-700 days",
    "100-15,000 kg",
    "0.5-50 kg",
    "5-500 days",
    "21-400 days",
    "1-12"
  ),
  
  Problem_Count = c(
    sum((lactation$afc_months < 0 | lactation$afc_months > 100) & lactation$parity == 1, na.rm = TRUE),
    sum((lactation$calving_interval_days < 280 | lactation$calving_interval_days > 1000) & lactation$parity > 1, na.rm = TRUE),
    sum(lactation$total_lactation_yield == 0 | lactation$total_lactation_yield > 20000, na.rm = TRUE),
    sum(milking$daily_yield <= 0 | milking$daily_yield > 50, na.rm = TRUE),
    sum(milking$dim < 0 | milking$dim > 1000, na.rm = TRUE),
    sum(lactation$days_open < 0 | lactation$days_open > 500, na.rm = TRUE),
    sum(lactation$parity < 1 | lactation$parity > 15, na.rm = TRUE)
  )
)

# Calculate percentages
quality_summary <- quality_summary %>%
  mutate(
    Problem_Pct = round(100 * Problem_Count / N_Total, 2),
    Quality_Status = case_when(
      Problem_Pct < 1 ~ "Good",
      Problem_Pct < 5 ~ "Minor Issues",
      Problem_Pct < 20 ~ "Moderate Issues",
      TRUE ~ "Major Issues"
    )
  )

kable(quality_summary, 
      caption = "Data Quality Summary with Min/Max Values",
      col.names = c("Trait", "N Total", "Min", "Max", "Raw Mean", 
                    "Recommended Range", "Problem N", "Problem %", "Status"),
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)

Data Quality Summary with Min/Max Values
Trait	N Total	Min	Max	Raw Mean	Recommended Range	Problem N	Problem %	Status
Age at First Calving	NA	-394	24,295.0	50.96	18-72 months	1,608	NA	Major Issues
Calving Interval	NA	0	16,530.0	521.70	300-700 days	4,529	NA	Major Issues
Total Lactation Yield	88,347	0	23,186.4	112.02	100-15,000 kg	51,155	57.90	Major Issues
Daily Milk Yield	1,048,546	0	50.0	5.51	0.5-50 kg	333,069	31.76	Major Issues
Days in Milk	1,048,575	-729,939	7,752.0	220.35	5-500 days	22,277	2.12	Minor Issues
Days Open	50,710	-280	669,062.0	255.21	21-400 days	7,307	14.41	Moderate Issues
Parity	87,990	1	32.0	2.42	1-12	39	0.04	Good

13.2 Key Findings

Summary of Outliers Detected:

Trait	Min Value	Max Value	Issue
AFC	-394 months	24,295 months	Negative values & extreme highs
DIM	-729,939 days	7,752 days	Extreme negative values
Days Open	-280 days	669,062 days	Negative values
Total Lact. Yield	0 kg	23,186.4 kg	57.9% zeros

13.3 Suggested Data Cleaning Filters

# Lactation data cleaning
lactation_clean <- lactation %>%
  filter(
    is.na(afc_months) | (afc_months >= 18 & afc_months <= 72),
    is.na(calving_interval_days) | (calving_interval_days >= 300 & calving_interval_days <= 700),
    total_lactation_yield > 0 & total_lactation_yield <= 15000,
    is.na(days_open) | (days_open >= 21 & days_open <= 400),
    parity >= 1 & parity <= 12
  )

# Milking data cleaning
milking_clean <- milking %>%
  filter(
    dim >= 5 & dim <= 500,
    daily_yield >= 0.5 & daily_yield <= 50,
    is.na(afc_months) | (afc_months >= 18 & afc_months <= 72),
    is.na(parity) | (parity >= 1 & parity <= 12)
  )

14 Session Info

sessionInfo()

## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_Kenya.utf8  LC_CTYPE=English_Kenya.utf8   
## [3] LC_MONETARY=English_Kenya.utf8 LC_NUMERIC=C                  
## [5] LC_TIME=English_Kenya.utf8    
## 
## time zone: Africa/Nairobi
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] viridis_0.6.5     viridisLite_0.4.2 kableExtra_1.4.0  knitr_1.50       
##  [5] gridExtra_2.3     scales_1.4.0      ggplot2_3.5.2     tidyr_1.3.1      
##  [9] dplyr_1.1.4       data.table_1.17.8
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_2.0.0     compiler_4.5.1     tidyselect_1.2.1  
##  [5] xml2_1.4.0         stringr_1.5.1      jquerylib_0.1.4    textshaping_1.0.2 
##  [9] systemfonts_1.2.3  yaml_2.3.10        fastmap_1.2.0      R6_2.6.1          
## [13] labeling_0.4.3     generics_0.1.4     tibble_3.3.0       svglite_2.2.1     
## [17] bslib_0.9.0        pillar_1.11.0      RColorBrewer_1.1-3 rlang_1.1.6       
## [21] stringi_1.8.7      cachem_1.1.0       xfun_0.52          sass_0.4.10       
## [25] cli_3.6.5          withr_3.0.2        magrittr_2.0.3     digest_0.6.37     
## [29] grid_4.5.1         rstudioapi_0.17.1  lifecycle_1.0.4    vctrs_0.6.5       
## [33] evaluate_1.0.5     glue_1.8.0         farver_2.1.2       rmarkdown_2.29    
## [37] purrr_1.1.0        tools_4.5.1        pkgconfig_2.0.3    htmltools_0.5.8.1

Report generated on 05 December 2025 at 04:48

Kenya ADGG/TDGG Dairy Data Quality Assessment

Outlier Detection & Data Anomaly Report

Setegn Worku Alemu - ILRI

05 December 2025